Changeset 28767 for gs2-extensions/parallel-building/trunk
- Timestamp:
- 2013-12-18T10:21:53+13:00 (10 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl
r28665 r28767 5 5 use warnings; 6 6 # Libraries 7 use File::Path qw( remove_tree );8 use File::Temp qw( tempdir );7 use File::Path qw( remove_tree ); 8 use File::Temp qw( tempdir ); 9 9 use Getopt::Long; 10 use Time::HiRes qw( gettimeofday tv_interval ); 10 use POSIX qw( strftime ); 11 use Time::HiRes qw( gettimeofday tv_interval ); 11 12 12 13 BEGIN 13 14 { 14 print " ======================= Greenstone Import + I/O Metrics ======================\n\n";15 print "\n======================= Greenstone Import + I/O Metrics ======================\n\n"; 15 16 if (!defined $ENV{'GSDLHOME'}) 16 17 { … … 19 20 } 20 21 } 22 23 print "Prepare and run a number of collection imports while recording I/O metrics.\n\n"; 21 24 22 25 # 1. Parse and sanitize arguments - the listing of syscalls whose duration we … … 24 27 # strace_analyzer.pl would use 25 28 print " * Initializing... Done\n"; 29 my $machine_name = `hostname -s`; 30 chomp($machine_name); 31 $machine_name = ucfirst($machine_name); 32 my $os_name = `lsb_release -i`; 33 $os_name =~ s/^Distributor ID:\s+(.*)\r?\n$/$1/i; 34 my $fs_name = `df -T $ENV{'GSDLHOME'}`; 35 $fs_name =~ s/^.*(ext2|ext3|ext4|xfs|zfs).*$/$1/is; 36 $fs_name = uc($fs_name); 37 my $start_time = [gettimeofday()]; 38 my @collections; 39 my $print_report = 0; 26 40 our $strace_flags = '-f -q -s 256 -T -ttt'; 27 41 our $io_function_list = { … … 40 54 'ftell'=>1, 41 55 'getdents'=>1, 42 #'ioctl'=>1, # John added56 'ioctl'=>1, # John added 43 57 'llseek'=>1, 44 58 'lockf'=>1, … … 59 73 'write'=>1 60 74 }; 61 my $start_time = [gettimeofday()]; 62 my $collection = ''; 63 my $debug = 0; 64 my $flush_delay = 3; 65 66 GetOptions ('collection=s' => \$collection, 75 our $debug = 0; 76 our $flush_delay = 3; 77 our $test_runs = 9; 78 79 GetOptions ('collection=s' => \@collections, 67 80 'debug' => \$debug, 68 'flushdelay=i' => \$flush_delay) 81 'flushdelay=i' => \$flush_delay, 82 'runs=i' => \$test_runs, 83 'report' => \$print_report) 69 84 or &printUsage('Invalid or missing argument'); 70 85 71 if (!defined $collection || $collection eq '') 72 { 73 &printUsage('Missing collection name'); 74 } 75 my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection; 76 if (!-d $collection_path) 77 { 78 &printUsage('Collection not found: ' . $collection_path); 79 } 80 print ' - Greenstone: ' . $ENV{'GSDLHOME'} . "\n"; 81 print ' - Collection: ' . $collection . "\n"; 82 print ' - Flush Delay: ' . $flush_delay . "\n"; 83 print ' - Debug? ' . ($debug ? 'Yes' : 'No') . "\n"; 86 if ($test_runs < 1) 87 { 88 &printUsage('Test runs must be non-zeo'); 89 } 90 my $db_name = 'strace_' . $machine_name . '_' . $os_name . '_' . $fs_name . '.sqlite3db'; 91 my $db_path = $ENV{'GSDLHOME'} . '/collect/' . $db_name; 92 93 if (0 < $print_report) 94 { 95 &printReport($db_path); 96 exit; 97 } 98 99 if (0 == scalar(@collections)) 100 { 101 &printUsage('Missing collection or collections'); 102 } 103 104 print ' - Greenstone: ' . $ENV{'GSDLHOME'} . "\n"; 105 print ' - Collections: ' . join(',', @collections) . "\n"; 106 print ' - Test Runs: ' . $test_runs . "\n"; 107 print ' - Flush Delay: ' . $flush_delay . "\n"; 108 print ' - Debug? ' . ($debug ? 'Yes' : 'No') . "\n"; 84 109 print "\n"; 85 110 86 # 2. Run dummy import command (empty import directory) within trace to 87 # determine baseline 88 &clearArchives(); 89 &flushDiskCache($flush_delay); 90 print " * Running baseline collection import... "; 91 my $dummy_dir = tempdir( CLEANUP => 1); 92 my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1'; 93 my $debug_path1 = ''; 94 if ($debug) 95 { 96 $debug_path1 = $collection_path . '/debug-baseline.tsv'; 97 } 98 my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1); 99 print "Done\n"; 100 print ' - Duration: ' . sprintf('%0.6f', $baseline_duration) . " seconds\n"; 101 print ' - SysCall Time: ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n"; 102 print ' - SysCall Prct: ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n"; 103 print ' - I/O Time: ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; 104 print ' - I/O Percent: ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n"; 105 if ($debug) 106 { 107 print " - See 'debug-baseline.tsv' for raw data\n"; 108 } 111 # 0. Create database a populate with tests (if necessary) 112 if (!-f $db_path) 113 { 114 print STDOUT ' * Creating database tables... '; 115 execSQL($db_path, 'CREATE TABLE IF NOT EXISTS tests (collection TEXT, test_run INTEGER, test_started INTEGER DEFAULT 0, docs_found INTEGER, docs_processed INTEGER, b_elapsed REAL DEFAULT 0, b_io REAL DEFAULT 0, b_other REAL DEFAULT 0, i_elapsed REAL DEFAULT 0, i_io REAL DEFAULT 0, i_other REAL DEFAULT 0, PRIMARY KEY (collection, test_run))'); 116 print STDOUT "Done\n"; 117 } 118 print ' * Populating database with test runs as necessary... '; 119 my $new_test_count = 0; 120 foreach my $collection (@collections) 121 { 122 for (my $i = 1; $i <= $test_runs; $i++) 123 { 124 my $search_sql = "SELECT COUNT(*) FROM tests WHERE collection='" . $collection . "' AND test_run=" . $i; 125 my $test_count = &getValueSQL($db_path, $search_sql); 126 if (0 == $test_count) 127 { 128 $new_test_count++; 129 my $insert_sql = "INSERT INTO tests (collection, test_run) VALUES ('" . $collection . "'," . $i . ")"; 130 &execSQL($db_path, $insert_sql); 131 } 132 } 133 } 134 print $new_test_count . " tests added\n"; 109 135 print "\n"; 110 136 111 # 3. Run normal import command within strace reading output line by line 112 &clearArchives(); 113 &flushDiskCache($flush_delay); 114 print " * Running import and tracking I/O metrics... "; 115 my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1'; 116 my $debug_path2 = ''; 117 if ($debug) 118 { 119 $debug_path2 = $collection_path . '/debug-import.tsv'; 120 } 121 my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); 122 print "Done\n"; 123 print ' - Import? ' . ( $import_complete ? 'Completed' : 'Failed') . "\n"; 124 print ' - Found: ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 125 print ' - Processed: ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 126 print ' - Duration: ' . sprintf('%0.6f', $duration_import) . " seconds\n"; 127 print ' - SysCall Time: ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n"; 128 print ' - SysCall Prct: ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n"; 129 print ' - I/O Duration: ' . sprintf('%0.6f', $duration_io) . " seconds\n"; 130 print ' - I/O Percent: ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n"; 131 print " - See 'import.log' for Greenstone Import details\n"; 132 print " - See 'strace.out' for STrace details\n"; 133 if ($debug) 134 { 135 print " - See 'debug-import.tsv' for raw data\n"; 136 } 137 print "\n"; 138 139 # 4. Results 140 print &makeHeader('Import Results', 78) . "\n\n"; 141 print " Import Duration: " . sprintf('%0.6f', ($duration_import - $baseline_duration)) . " seconds\n"; 142 print " I/O Duration: " . sprintf('%0.6f', ($duration_io - $baseline_io)) . " seconds\n"; 143 print " I/O Percentage: " . sprintf('%d', ((($duration_io - $baseline_io)/($duration_import - $baseline_duration)) * 100)) . "%\n"; 144 print "\n"; 145 146 # Complete! 137 # 1. Continue picking random tests and running them until there are none left 138 my $total_test_count = &getValueSQL($db_path, 'SELECT COUNT(*) FROM tests'); 139 my $remaining_test_sql = 'SELECT COUNT(*) FROM tests WHERE test_started=0'; 140 my $remaining_test_count = getValueSQL($db_path, $remaining_test_sql); 141 my $exit_file_path = $ENV{'GSDLHOME'} . '/collect/exit.now'; 142 while ($remaining_test_count ne "0" && !-f $exit_file_path) 143 { 144 my $test_started = time; 145 my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime($test_started); 146 my $random_test_sql = 'SELECT collection, test_run FROM tests WHERE test_started=0 ORDER BY RANDOM() LIMIT 1'; 147 my ($collection, $test_run) = &getRecordSQL($db_path, $random_test_sql); 148 149 print &makeHeader('Running test: ' . $collection . '#' . $test_run, 78) . "\n\n"; 150 print " * Note: to gracefully exit create a file: <gsdlhome>/collect/exit.now\n"; 151 print ' * Started: ' . $now_string . "\n"; 152 153 my $found_result = 0; 154 do 155 { 156 my @results = &importWithStrace($collection); 157 # We don't allow negative results... just try again 158 if ($results[0] != $results[1]) 159 { 160 print "Warning! Failed to process all the documents found - rerunning test...\n"; 161 } 162 elsif (0 >= $results[1]) 163 { 164 print "Warning! Failed to process any documents - rerunning test...\n"; 165 } 166 # I'm not sure how this happens, but it seems on really fast imports they 167 # sometimes happen faster than the baseline import?!? 168 elsif ($results[5] < $results[2]) 169 { 170 print "Warning! Baseline import took longer than actual import - rerunning test...\n"; 171 } 172 elsif ($results[6] < $results[3]) 173 { 174 print "Warning! Spent more time on I/O syscalls in baseline than during import - rerunning test...\n"; 175 } 176 elsif ($results[7] < $results[4]) 177 { 178 print "Warning! Spent more time on non-I/O syscalls in baseline than during import - rerunning test...\n"; 179 } 180 # I'm also seeing a lot of negative percentages caused by the total delta 181 # time spend on io and other syscalls being more than the delta time 182 # between import and baseline durations 183 elsif (($results[5] - $results[2]) < (($results[6] - $results[3]) + ($results[7] - $results[4]))) 184 { 185 print "Warning! Time difference between baseline and import durations less than total time differences between io and misc system calls - rerunning test...\n"; 186 } 187 else 188 { 189 my $update_sql = 'UPDATE tests SET test_started=' . $test_started . ', '; 190 $update_sql .= 'docs_found=' . $results[0] . ', docs_processed=' . $results[1] . ', '; 191 $update_sql .= 'b_elapsed=' . sprintf('%0.6f', $results[2]) . ', '; 192 $update_sql .= 'b_io=' . sprintf('%0.6f', $results[3]) . ', '; 193 $update_sql .= 'b_other=' . sprintf('%0.6f', $results[4]) . ', '; 194 $update_sql .= 'i_elapsed=' . sprintf('%0.6f', $results[5]) . ', '; 195 $update_sql .= 'i_io=' . sprintf('%0.6f', $results[6]) . ', '; 196 $update_sql .= 'i_other=' . sprintf('%0.6f', $results[7]) . ' '; 197 $update_sql .= 'WHERE collection=\'' . $collection . '\' AND test_run=' . $test_run; 198 execSQL($db_path, $update_sql); 199 $found_result = 1; 200 } 201 } 202 while(0 == $found_result); 203 204 # Repeat until we have exhausted pending tests 205 $remaining_test_count = &getValueSQL($db_path, $remaining_test_sql); 206 207 my $x = $total_test_count - $remaining_test_count; 208 print &makeHeader('Test Complete: ' . $collection . '#' . $test_run, 78) . "\n\n"; 209 print ' * ' . sprintf("%.0f",(($x/$total_test_count)*100)) . "% of all tests complete!\n"; 210 print ' * ' . $remaining_test_count . " tests remaining\n"; 211 print "\n"; 212 } 213 214 if (-f $exit_file_path) 215 { 216 unlink($exit_file_path); 217 } 218 219 # 2. Complete! 147 220 my $end_time = [gettimeofday()]; 148 221 my $duration = tv_interval($start_time, $end_time); … … 151 224 exit; 152 225 226 ################################################################################ 227 228 ## @function importWithStrace() 229 # 230 sub importWithStrace 231 { 232 my ($collection) = @_; 233 my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection; 234 if (!-d $collection_path) 235 { 236 &printError('Collection not found: ' . $collection_path); 237 } 238 239 # 2. Run dummy import command (empty import directory) within trace to 240 # determine baseline 241 &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp'); 242 &flushDiskCache($flush_delay); 243 print " * Running baseline collection import... "; 244 my $dummy_dir = tempdir( CLEANUP => 1); 245 my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1'; 246 my $debug_path1 = ''; 247 if ($debug) 248 { 249 $debug_path1 = $collection_path . '/debug-baseline.tsv'; 250 } 251 my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1); 252 print "Done\n"; 253 print ' - Duration: ' . sprintf('%0.6f', $baseline_duration) . " seconds\n"; 254 print " - System Calls Breakdown:\n"; 255 print ' - I/O Duration: ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; 256 print ' - I/O Percent: ' . sprintf('%0.1f', ($baseline_io / $baseline_duration) * 100) . "%\n"; 257 print ' - Other Duratn: ' . sprintf('%0.6f', $baseline_misc) . " seconds\n"; 258 print ' - Other Percnt: ' . sprintf('%0.1f', ($baseline_misc / $baseline_duration) * 100) . "%\n"; 259 if ($debug) 260 { 261 print " - See 'debug-baseline.tsv' for raw data\n"; 262 } 263 if (-d $dummy_dir) 264 { 265 rmdir($dummy_dir); 266 } 267 268 # 3. Run normal import command within strace reading output line by line 269 &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp'); 270 &flushDiskCache($flush_delay); 271 print " * Running import and tracking I/O metrics... "; 272 my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1'; 273 my $debug_path2 = ''; 274 if ($debug) 275 { 276 $debug_path2 = $collection_path . '/debug-import.tsv'; 277 } 278 my ($import_duration, $import_io, $import_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); 279 print "Done\n"; 280 print ' - Import? ' . ( $import_complete ? 'Completed' : 'Failed') . "\n"; 281 print ' - Found: ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 282 print ' - Processed: ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 283 print ' - Duration: ' . sprintf('%0.6f', $import_duration) . " seconds\n"; 284 print " - System Calls Breakdown:\n"; 285 print ' - I/O Duration: ' . sprintf('%0.6f', $import_io) . " seconds\n"; 286 print ' - I/O Percent: ' . sprintf('%0.1f', ($import_io / $import_duration) * 100) . "%\n"; 287 print ' - Other Duratn: ' . sprintf('%0.6f', $import_misc) . " seconds\n"; 288 print ' - Other Percnt: ' . sprintf('%0.1f', ($import_misc / $import_duration) * 100) . "%\n"; 289 print " - See 'import.log' for Greenstone Import details\n"; 290 print " - See 'strace.out' for STrace details\n"; 291 if ($debug) 292 { 293 print " - See 'debug-import.tsv' for raw data\n"; 294 } 295 print "\n"; 296 297 # 4. Results 298 my $delta_duration = $import_duration - $baseline_duration; 299 my $delta_io = $import_io - $baseline_io; 300 my $delta_io_percent = ( $delta_io / $delta_duration ) * 100; 301 my $delta_misc = $import_misc - $baseline_misc; 302 my $delta_misc_percent = ( $delta_misc / $delta_duration ) * 100; 303 print &makeHeader('Import Results', 78) . "\n\n"; 304 print ' Import Duration: ' . sprintf('%0.6f', $delta_duration) . " seconds\n"; 305 print " System Calls Breakdown:\n"; 306 print ' - I/O Duration: ' . sprintf('%0.6f', $delta_io) . " seconds\n"; 307 print ' - I/O Percent: ' . sprintf('%0.1f', $delta_io_percent) . "%\n"; 308 print ' - Other Duratn: ' . sprintf('%0.6f', $delta_misc) . " seconds\n"; 309 print ' - Other Percnt: ' . sprintf('%0.1f', $delta_misc_percent) . "%\n"; 310 print "\n"; 311 312 return ($docs_found, $docs_processed, $baseline_duration, $baseline_io, $baseline_misc, $import_duration, $import_io, $import_misc); 313 } 314 ## importWithStrace() 315 316 ## @function makeHeader($msg, [$length]) 317 # 318 # Create a centered header string given a certain message padded with '=' characters. 319 # 320 # @param $msg The message to center as a string 321 # @param $length The desired length of string - defaults to 79 322 # @return A string centered with '=' as padding 323 # 324 sub makeHeader 325 { 326 my ($msg, $length) = @_; 327 if (!defined $length) 328 { 329 $length = 79; # 80 with newline 330 } 331 if (length($msg) % 2 == 0) 332 { 333 $msg .= ' '; 334 } 335 my $filler_length = ($length - length($msg)) / 2; 336 my $filler = '=' x $filler_length; 337 $msg = $filler . ' ' . $msg . ' ' . $filler; 338 return $msg; 339 } 340 ## makeHeader() ## 341 153 342 154 343 ## @function 155 # 156 sub makeHeader 157 { 158 my ($msg, $length) = @_; 159 if (length($msg) % 2 == 1) 160 { 161 $msg = ' ' . $msg . ' '; 162 } 163 else 164 { 165 $msg = ' ' . $msg . ' '; 166 } 167 while (length($msg) < $length) 168 { 169 $msg = '=' . $msg . '='; 170 } 171 return $msg; 172 } 173 ## makeHeader() ## 174 175 176 ## @function 177 # 178 sub clearArchives 179 { 180 # 2. Remove any existing archives directory, so it doesn't get factored in the 181 # IO costs 182 my $archives_dir = $collection_path . '/archives'; 183 if (-d $archives_dir) 184 { 185 print " * Deleting existing archives directory... "; 186 remove_tree($archives_dir); 187 print "Done\n"; 188 } 189 } 190 ## clearArchives() 191 344 # Remove named Greenstone directory so it doesn't get factored in I/O costs 345 # 346 sub clearExistingDirectory 347 { 348 my $collection_path = shift(@_); 349 for my $dirname (@_) 350 { 351 my $dir = $collection_path . '/' . $dirname; 352 if (-d $dir) 353 { 354 print ' * Deleting existing ' . $dirname . ' directory... '; 355 remove_tree($dir); 356 print "Done\n"; 357 } 358 } 359 } 360 ## clearExistingDirectory() 192 361 193 362 ## @function … … 229 398 $flush_delay--; 230 399 } 231 print "Done\n \n";400 print "Done\n"; 232 401 } 233 402 ## flushDiskCache() … … 257 426 my $gslog_path = $logs_path . '/import.log'; 258 427 open(GSLOGOUT, '>:utf8', $gslog_path) or die("Error! Failed to open file for writing: " . $gslog_path); 259 $logging_enabled = 1;428 #$logging_enabled = 1; 260 429 } 261 430 … … 278 447 print STRACEOUT $line; 279 448 } 449 450 if ($line =~ /^[^\d\[]+(.*)/) 451 { 452 $line = $1; 453 } 454 280 455 # we may have a line that unfortunately includes newlines in its arguments list 281 456 # - watch out for unfinished syscalls that will be resumed later … … 337 512 # limit the resumed calls duration to, at most, the time difference 338 513 # between this syscall and the next. This is highly inaccurate, of 339 # course, as it includes time spent in userspace but is significantly340 # better than a elasped duration several times longer than the syscall341 # actually took.514 # course, as it excludes time spent before the suspension but then 515 # includes time spent in userspace but is significantly better than a 516 # elasped duration several times longer than the syscall actually took. 342 517 if ($next_line =~ /^.*?(\d+\.\d+)/) 343 518 { … … 363 538 if ($syscall eq 'write' && $args_prefix =~ /^[12],/) 364 539 { 365 print "\n\nPending write pid:$pid line:|" . $line . "|\n";540 #print "\n\nPending write pid:$pid line:|" . $line . "|\n"; 366 541 if (!defined $interrupted_stacks->{$pid}) 367 542 { … … 379 554 } 380 555 # - exit_group never has a duration 381 elsif ($line =~ /^(\d+\.\d+)\s+ exit_group\((\d+)\)\s+=\s+\?$/)556 elsif ($line =~ /^(\d+\.\d+)\s+(_exit|exit_group)\((\d+)\)\s+=\s+\?$/) 382 557 { 383 558 my $timestamp = $1; 384 my $exit_value = $2; 559 my $exit_function = $2; 560 my $exit_value = $3; 385 561 $end_timestamp = $timestamp; 386 562 if ($debug) … … 388 564 print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t"; 389 565 print TSVOUT $pid . "\t"; 390 print TSVOUT "exit_group\t";566 print TSVOUT $exit_function . "\t"; 391 567 print TSVOUT "0.000000\t"; 392 568 print TSVOUT sprintf("%0.6f", $misc_duration) . "\t"; … … 510 686 } 511 687 688 ## @function 689 # 690 sub execSQL 691 { 692 my ($db_path, $sql) = @_; 693 # call getValueSQL but don't care about result 694 getValueSQL($db_path, $sql); 695 } 696 # /** execSQL() **/ 697 698 699 ## @function 700 # 701 sub getRecordSQL 702 { 703 my ($db_path, $sql) = @_; 704 if ($sql !~ /LIMIT 1/i) 705 { 706 $sql .= ' LIMIT 1'; 707 } 708 my $value = getValueSQL($db_path, $sql); 709 return split(/\|/,$value); 710 } 711 # /** getRecordSQL() **/ 712 713 714 ## @function 715 # 716 sub getRecordsSQL 717 { 718 my ($db_path, $sql) = @_; 719 my @records; 720 my $raw_values = getValueSQL($db_path, $sql); 721 foreach my $raw_record (split(/\r?\n/, $raw_values)) 722 { 723 my @record = split(/\|/, $raw_record); 724 push(@records, \@record); 725 } 726 return @records; 727 } 728 ## getRecordsSQL() 729 730 731 ## @function 732 # 733 sub getValueSQL 734 { 735 my ($db_path, $sql) = @_; 736 my $result = `sqlite3 "$db_path" "$sql" 2>&1`; 737 if ($result =~ /Error:/) 738 { 739 die("Fatal Error!\nSQL:" . $sql . "\nMsg:" . $result); 740 } 741 # trim 742 $result =~ s/^\s*|\s*$//g; 743 return $result; 744 } 745 # /** getValueSQL() **/ 746 747 748 ## @function printReport() 749 # 750 sub printReport 751 { 752 my ($db_path) = @_; 753 # get listing of unique collection names sorted alphabetically. While we are 754 # at it, grab the number of documents processed from pretty much any entry 755 # for this collection (they should all be the same, otherwise they would've 756 # been repeated back in testing) 757 my $collection_sql = 'SELECT collection, AVG(docs_processed) FROM tests GROUP BY collection'; 758 my @collection_records = &getRecordsSQL($db_path, $collection_sql); 759 foreach my $collection_record (@collection_records) 760 { 761 my ($collection, $docs_processed) = @{$collection_record}; 762 # determine the size in bytes of the import directory 763 my $collection_import_directory = $ENV{'GSDLHOME'} . '/collect/' . $collection . '/import'; 764 my $du_command = 'du -bsL "' . $collection_import_directory . '"'; 765 my $du_result = `$du_command`; 766 my $size_in_bytes = 0; 767 if ($du_result =~ /^(\d+)\s+/) 768 { 769 $size_in_bytes = $1; 770 } 771 # pretty print header block for dokuwiki 772 &printReportHeader($collection, $docs_processed, $size_in_bytes); 773 my @test_records = &getRecordsSQL($db_path, 'SELECT * FROM tests WHERE collection=\'' . $collection . '\' AND test_started > 0 ORDER BY test_started'); 774 my $number_of_tests = scalar(@test_records); 775 my $total_b_elapsed = 0; 776 my @b_elapsed_values; 777 my $total_b_io_percent = 0; 778 my @b_io_percent_values; 779 my $total_b_other_percent = 0; 780 my @b_other_percent_values; 781 my $total_d_elapsed = 0; 782 my @d_elapsed_values; 783 my $total_d_io_percent = 0; 784 my @d_io_percent_values; 785 my $total_d_other_percent = 0; 786 my @d_other_percent_values; 787 foreach my $test_record (@test_records) 788 { 789 my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record}; 790 my $b_io_percent = ($b_io / $b_elapsed) * 100; 791 my $b_other_percent = ($b_other / $b_elapsed) * 100; 792 my $d_elapsed = $i_elapsed - $b_elapsed; 793 my $d_io = $i_io - $b_io; 794 my $d_io_percent = ($d_io / $d_elapsed) * 100; 795 my $d_other = $i_other - $b_other; 796 my $d_other_percent = ($d_other / $d_elapsed) * 100; 797 # add to running totals 798 $total_b_elapsed += $b_elapsed; 799 push(@b_elapsed_values, $b_elapsed); 800 $total_b_io_percent += $b_io_percent; 801 push(@b_io_percent_values, $b_io_percent); 802 $total_b_other_percent += $b_other_percent; 803 push(@b_other_percent_values, $b_other_percent); 804 $total_d_elapsed += $d_elapsed; 805 push(@d_elapsed_values, $d_elapsed); 806 $total_d_io_percent += $d_io_percent; 807 push(@d_io_percent_values, $d_io_percent); 808 $total_d_other_percent += $d_other_percent; 809 push(@d_other_percent_values, $d_other_percent); 810 } 811 # Averages 812 my $b_elapsed_avg = $total_b_elapsed / $number_of_tests; 813 my $b_io_percent_avg = $total_b_io_percent / $number_of_tests; 814 my $b_other_percent_avg = $total_b_other_percent / $number_of_tests; 815 my $d_elapsed_avg = $total_d_elapsed / $number_of_tests; 816 my $d_io_percent_avg = $total_d_io_percent / $number_of_tests; 817 my $d_other_percent_avg = $total_d_other_percent / $number_of_tests; 818 # Standard Deviations 819 my $b_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $b_elapsed_avg, @b_elapsed_values); 820 my $b_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_io_percent_avg, @b_io_percent_values); 821 my $b_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_other_percent_avg, @b_other_percent_values); 822 my $d_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $d_elapsed_avg, @d_elapsed_values); 823 my $d_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_io_percent_avg, @d_io_percent_values); 824 my $d_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_other_percent_avg, @d_other_percent_values); 825 826 my $counter = 0; 827 my $outlier_count = 0; 828 foreach my $test_record (@test_records) 829 { 830 my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record}; 831 $counter++; 832 my $b_io_percent = ($b_io / $b_elapsed) * 100; 833 my $b_other_percent = ($b_other / $b_elapsed) * 100; 834 my $d_elapsed = $i_elapsed - $b_elapsed; 835 my $d_io = $i_io - $b_io; 836 my $d_io_percent = ($d_io / $d_elapsed) * 100; 837 my $d_other = $i_other - $b_other; 838 my $d_other_percent = ($d_other / $d_elapsed) * 100; 839 print '| ' . $counter . ' | ' . $the_collection . ' | ' . $docs_processed . ' | ' . $size_in_bytes . ' | ' . $test_started . ' | ' . sprintf('%0.6f', $b_elapsed) . ' | ' . sprintf('%0.1f', $b_io_percent) . '% | ' . sprintf('%0.1f', $b_other_percent) . '% | ' . sprintf('%0.6f', $d_elapsed) . ' | ' . sprintf('%0.1f', $d_io_percent) . '% | ' . sprintf('%0.1f', $d_other_percent) . '% | '; 840 if (&isOutlier($d_io_percent, $d_io_percent_avg, $d_io_percent_stddev)) 841 { 842 print 'Yes'; 843 $outlier_count++; 844 } 845 else 846 { 847 print 'No'; 848 } 849 print " |\n"; 850 } 851 print '^ Average ||||| ' . sprintf('%0.6f', $b_elapsed_avg) . ' | ' . sprintf('%0.1f', $b_io_percent_avg) . '% | ' . sprintf('%0.1f', $b_other_percent_avg) . '% | ' . sprintf('%0.6f', $d_elapsed_avg) . ' | ' . sprintf('%0.1f', $d_io_percent_avg) . '% | ' . sprintf('%0.1f', $d_other_percent_avg) . "% |\n"; 852 print '^ Standard Deviation ||||| ' . sprintf('%0.6f', $b_elapsed_stddev) . ' | ' . sprintf('%0.1f', $b_io_percent_stddev) . '% | ' . sprintf('%0.1f', $b_other_percent_stddev) . '% | ' . sprintf('%0.6f', $d_elapsed_stddev) . ' | ' . sprintf('%0.1f', $d_io_percent_stddev) . '% | ' . sprintf('%0.1f', $d_other_percent_stddev) . '% | ' . $outlier_count . " |\n"; 853 print "\n"; 854 } 855 } 856 ## printReport() 857 858 859 ## @function printReportHeader($collection) 860 # 861 sub printReportHeader 862 { 863 my ($collection) = @_; 864 print '==== "' . $collection . "\" Collection ====\n\n"; 865 print "^ Count ^ Collection ^ NumDocs ^ Size ^ Timestamp ^ Base ^^^ Import ^^^ Outlier? ^\n"; 866 print "^ ::: ^ ::: ^ ::: ^ ::: ^ ::: ^ Elapsed ^ Syscalls ^^ Elapsed ^ Syscalls ^^ ::: ^\n"; 867 print "^ ::: ^ ::: ^ ::: ^ ::: ^ ::: ^ ::: ^ I/O% ^ Other% ^ ::: ^ I/O% ^ Other ^ ::: ^\n"; 868 } 869 ## printReportHeader() 870 871 872 ## @function calculateStandardDeviation($mean, $value1 ... $valueN) 873 # Our sample size is the population so we can use the simplier standard deviation equation for an entire population (now, theres some rap lyrics) 874 # stddev = squareroot ( sum ( square( value - mean ) ) / count ) 875 sub calculateStandardDeviation 876 { 877 my $population = shift(@_); 878 my $mean = shift(@_); 879 my $count = scalar(@_); 880 my $total = 0; 881 # sum of squares 882 foreach my $value (@_) 883 { 884 my $subtotal = $value - $mean; 885 $total += $subtotal * $subtotal; 886 } 887 if ($population == $count) 888 { 889 $total = $total / $count; 890 } 891 else 892 { 893 $total = $total / ($count - 1); 894 } 895 my $standard_deviation = sqrt($total); 896 return $standard_deviation; 897 } 898 ## calculateStandardDeviation() 899 900 901 ## @function isOutlier($value, $mean, $stddev) 902 # 903 sub isOutlier 904 { 905 my ($value, $mean, $stddev) = @_; 906 my $min = $mean - (2 * $stddev); 907 my $max = $mean + (2 * $stddev); 908 return ($value <= $min || $max <= $value); 909 } 910 ## isOutlier() 911 512 912 1;
Note:
See TracChangeset
for help on using the changeset viewer.