#!/usr/bin/perl # Pragma use strict; use warnings; # Libraries use File::Path qw( remove_tree ); use File::Temp qw( tempdir ); use Getopt::Long; use POSIX qw( strftime ); use Time::HiRes qw( gettimeofday tv_interval ); BEGIN { print "\n======================= Greenstone Import + I/O Metrics ======================\n\n"; if (!defined $ENV{'GSDLHOME'}) { print "Error! GSDLHOME not set\n\n"; exit; } } print "Prepare and run a number of collection imports while recording I/O metrics.\n\n"; # 1. Parse and sanitize arguments - the listing of syscalls whose duration we # accumulate as IO time. Unless otherwise noted, these are taken from what # strace_analyzer.pl would use print " * Initializing... Done\n"; my $machine_name = `hostname -s`; chomp($machine_name); $machine_name = ucfirst($machine_name); my $os_name = `lsb_release -i`; $os_name =~ s/^Distributor ID:\s+(.*)\r?\n$/$1/i; my $fs_name = `df -T $ENV{'GSDLHOME'}`; $fs_name =~ s/^.*(ext2|ext3|ext4|xfs|zfs).*$/$1/is; $fs_name = uc($fs_name); my $start_time = [gettimeofday()]; my @collections; my $print_report = 0; our $strace_flags = '-f -q -s 256 -T -ttt'; our $io_function_list = { 'access'=>1, 'chmod'=>1, 'close'=>1, 'creat'=>1, 'fclose'=>1, 'fcntl'=>1, 'fgetpos'=>1, 'flock'=>1, 'fseek'=>1, 'fsetpos'=>1, 'fstat'=>1, 'fsync'=>1, 'ftell'=>1, 'getdents'=>1, 'ioctl'=>1, # John added 'llseek'=>1, 'lockf'=>1, 'lseek'=>1, 'lseek64'=>1, 'mkdir'=>1, 'open'=>1, 'read'=>1, 'readdir'=>1, 'rename'=>1, 'rewind'=>1, 'rewinddir'=>1, 'scandir'=>1, 'stat'=>1, 'stat64'=>1, 'telldir'=>1, 'unlink'=>1, 'write'=>1 }; our $debug = 0; our $flush_delay = 3; our $test_runs = 9; GetOptions ('collection=s' => \@collections, 'debug' => \$debug, 'flushdelay=i' => \$flush_delay, 'runs=i' => \$test_runs, 'report' => \$print_report) or &printUsage('Invalid or missing argument'); if ($test_runs < 1) { &printUsage('Test runs must be non-zeo'); } my $db_name = 'strace_' . $machine_name . '_' . $os_name . '_' . $fs_name . '.sqlite3db'; my $db_path = $ENV{'GSDLHOME'} . '/collect/' . $db_name; if (0 < $print_report) { &printReport($db_path); exit; } if (0 == scalar(@collections)) { &printUsage('Missing collection or collections'); } print ' - Greenstone: ' . $ENV{'GSDLHOME'} . "\n"; print ' - Collections: ' . join(',', @collections) . "\n"; print ' - Test Runs: ' . $test_runs . "\n"; print ' - Flush Delay: ' . $flush_delay . "\n"; print ' - Debug? ' . ($debug ? 'Yes' : 'No') . "\n"; print "\n"; # 0. Create database a populate with tests (if necessary) if (!-f $db_path) { print STDOUT ' * Creating database tables... '; execSQL($db_path, 'CREATE TABLE IF NOT EXISTS tests (collection TEXT, test_run INTEGER, test_started INTEGER DEFAULT 0, docs_found INTEGER, docs_processed INTEGER, b_elapsed REAL DEFAULT 0, b_io REAL DEFAULT 0, b_other REAL DEFAULT 0, i_elapsed REAL DEFAULT 0, i_io REAL DEFAULT 0, i_other REAL DEFAULT 0, PRIMARY KEY (collection, test_run))'); print STDOUT "Done\n"; } print ' * Populating database with test runs as necessary... '; my $new_test_count = 0; foreach my $collection (@collections) { for (my $i = 1; $i <= $test_runs; $i++) { my $search_sql = "SELECT COUNT(*) FROM tests WHERE collection='" . $collection . "' AND test_run=" . $i; my $test_count = &getValueSQL($db_path, $search_sql); if (0 == $test_count) { $new_test_count++; my $insert_sql = "INSERT INTO tests (collection, test_run) VALUES ('" . $collection . "'," . $i . ")"; &execSQL($db_path, $insert_sql); } } } print $new_test_count . " tests added\n"; print "\n"; # 1. Continue picking random tests and running them until there are none left my $total_test_count = &getValueSQL($db_path, 'SELECT COUNT(*) FROM tests'); my $remaining_test_sql = 'SELECT COUNT(*) FROM tests WHERE test_started=0'; my $remaining_test_count = getValueSQL($db_path, $remaining_test_sql); my $exit_file_path = $ENV{'GSDLHOME'} . '/collect/exit.now'; while ($remaining_test_count ne "0" && !-f $exit_file_path) { my $test_started = time; my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime($test_started); my $random_test_sql = 'SELECT collection, test_run FROM tests WHERE test_started=0 ORDER BY RANDOM() LIMIT 1'; my ($collection, $test_run) = &getRecordSQL($db_path, $random_test_sql); print &makeHeader('Running test: ' . $collection . '#' . $test_run, 78) . "\n\n"; print " * Note: to gracefully exit create a file: /collect/exit.now\n"; print ' * Started: ' . $now_string . "\n"; my $found_result = 0; do { my @results = &importWithStrace($collection); # We don't allow negative results... just try again if ($results[0] != $results[1]) { print "Warning! Failed to process all the documents found - rerunning test...\n"; } elsif (0 >= $results[1]) { print "Warning! Failed to process any documents - rerunning test...\n"; } # I'm not sure how this happens, but it seems on really fast imports they # sometimes happen faster than the baseline import?!? elsif ($results[5] < $results[2]) { print "Warning! Baseline import took longer than actual import - rerunning test...\n"; } elsif ($results[6] < $results[3]) { print "Warning! Spent more time on I/O syscalls in baseline than during import - rerunning test...\n"; } elsif ($results[7] < $results[4]) { print "Warning! Spent more time on non-I/O syscalls in baseline than during import - rerunning test...\n"; } # I'm also seeing a lot of negative percentages caused by the total delta # time spend on io and other syscalls being more than the delta time # between import and baseline durations elsif (($results[5] - $results[2]) < (($results[6] - $results[3]) + ($results[7] - $results[4]))) { print "Warning! Time difference between baseline and import durations less than total time differences between io and misc system calls - rerunning test...\n"; } else { my $update_sql = 'UPDATE tests SET test_started=' . $test_started . ', '; $update_sql .= 'docs_found=' . $results[0] . ', docs_processed=' . $results[1] . ', '; $update_sql .= 'b_elapsed=' . sprintf('%0.6f', $results[2]) . ', '; $update_sql .= 'b_io=' . sprintf('%0.6f', $results[3]) . ', '; $update_sql .= 'b_other=' . sprintf('%0.6f', $results[4]) . ', '; $update_sql .= 'i_elapsed=' . sprintf('%0.6f', $results[5]) . ', '; $update_sql .= 'i_io=' . sprintf('%0.6f', $results[6]) . ', '; $update_sql .= 'i_other=' . sprintf('%0.6f', $results[7]) . ' '; $update_sql .= 'WHERE collection=\'' . $collection . '\' AND test_run=' . $test_run; execSQL($db_path, $update_sql); $found_result = 1; } } while(0 == $found_result); # Repeat until we have exhausted pending tests $remaining_test_count = &getValueSQL($db_path, $remaining_test_sql); my $x = $total_test_count - $remaining_test_count; print &makeHeader('Test Complete: ' . $collection . '#' . $test_run, 78) . "\n\n"; print ' * ' . sprintf("%.0f",(($x/$total_test_count)*100)) . "% of all tests complete!\n"; print ' * ' . $remaining_test_count . " tests remaining\n"; print "\n"; } if (-f $exit_file_path) { unlink($exit_file_path); } # 2. Complete! my $end_time = [gettimeofday()]; my $duration = tv_interval($start_time, $end_time); print &makeHeader(sprintf('Complete in %0.2f seconds', $duration), 78) . "\n\n"; exit; ################################################################################ ## @function importWithStrace() # sub importWithStrace { my ($collection) = @_; my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection; if (!-d $collection_path) { &printError('Collection not found: ' . $collection_path); } # 2. Run dummy import command (empty import directory) within trace to # determine baseline &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp'); &flushDiskCache($flush_delay); print " * Running baseline collection import... "; my $dummy_dir = tempdir( CLEANUP => 1); my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1'; my $debug_path1 = ''; if ($debug) { $debug_path1 = $collection_path . '/debug-baseline.tsv'; } my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1); print "Done\n"; print ' - Duration: ' . sprintf('%0.6f', $baseline_duration) . " seconds\n"; print " - System Calls Breakdown:\n"; print ' - I/O Duration: ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; print ' - I/O Percent: ' . sprintf('%0.1f', ($baseline_io / $baseline_duration) * 100) . "%\n"; print ' - Other Duratn: ' . sprintf('%0.6f', $baseline_misc) . " seconds\n"; print ' - Other Percnt: ' . sprintf('%0.1f', ($baseline_misc / $baseline_duration) * 100) . "%\n"; if ($debug) { print " - See 'debug-baseline.tsv' for raw data\n"; } if (-d $dummy_dir) { rmdir($dummy_dir); } # 3. Run normal import command within strace reading output line by line &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp'); &flushDiskCache($flush_delay); print " * Running import and tracking I/O metrics... "; my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1'; my $debug_path2 = ''; if ($debug) { $debug_path2 = $collection_path . '/debug-import.tsv'; } my ($import_duration, $import_io, $import_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); print "Done\n"; print ' - Import? ' . ( $import_complete ? 'Completed' : 'Failed') . "\n"; print ' - Found: ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; print ' - Processed: ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; print ' - Duration: ' . sprintf('%0.6f', $import_duration) . " seconds\n"; print " - System Calls Breakdown:\n"; print ' - I/O Duration: ' . sprintf('%0.6f', $import_io) . " seconds\n"; print ' - I/O Percent: ' . sprintf('%0.1f', ($import_io / $import_duration) * 100) . "%\n"; print ' - Other Duratn: ' . sprintf('%0.6f', $import_misc) . " seconds\n"; print ' - Other Percnt: ' . sprintf('%0.1f', ($import_misc / $import_duration) * 100) . "%\n"; print " - See 'import.log' for Greenstone Import details\n"; print " - See 'strace.out' for STrace details\n"; if ($debug) { print " - See 'debug-import.tsv' for raw data\n"; } print "\n"; # 4. Results my $delta_duration = $import_duration - $baseline_duration; my $delta_io = $import_io - $baseline_io; my $delta_io_percent = ( $delta_io / $delta_duration ) * 100; my $delta_misc = $import_misc - $baseline_misc; my $delta_misc_percent = ( $delta_misc / $delta_duration ) * 100; print &makeHeader('Import Results', 78) . "\n\n"; print ' Import Duration: ' . sprintf('%0.6f', $delta_duration) . " seconds\n"; print " System Calls Breakdown:\n"; print ' - I/O Duration: ' . sprintf('%0.6f', $delta_io) . " seconds\n"; print ' - I/O Percent: ' . sprintf('%0.1f', $delta_io_percent) . "%\n"; print ' - Other Duratn: ' . sprintf('%0.6f', $delta_misc) . " seconds\n"; print ' - Other Percnt: ' . sprintf('%0.1f', $delta_misc_percent) . "%\n"; print "\n"; return ($docs_found, $docs_processed, $baseline_duration, $baseline_io, $baseline_misc, $import_duration, $import_io, $import_misc); } ## importWithStrace() ## @function makeHeader($msg, [$length]) # # Create a centered header string given a certain message padded with '=' characters. # # @param $msg The message to center as a string # @param $length The desired length of string - defaults to 79 # @return A string centered with '=' as padding # sub makeHeader { my ($msg, $length) = @_; if (!defined $length) { $length = 79; # 80 with newline } if (length($msg) % 2 == 0) { $msg .= ' '; } my $filler_length = ($length - length($msg)) / 2; my $filler = '=' x $filler_length; $msg = $filler . ' ' . $msg . ' ' . $filler; return $msg; } ## makeHeader() ## ## @function # Remove named Greenstone directory so it doesn't get factored in I/O costs # sub clearExistingDirectory { my $collection_path = shift(@_); for my $dirname (@_) { my $dir = $collection_path . '/' . $dirname; if (-d $dir) { print ' * Deleting existing ' . $dirname . ' directory... '; remove_tree($dir); print "Done\n"; } } } ## clearExistingDirectory() ## @function # sub flushDiskCache { my ($flush_delay) = @_; # The Premise: use sudoedit and other black magic to clear out the memory- # based disk cache (which is done by writing the number 3 to a certain # system file) &printFlush(" * Synching file system... "); `sync`; print "Done\n"; &printFlush(" * Dropping memory disk cache... "); # - save our current default editor my $current_editor = $ENV{'EDITOR'}; # - replace default editor with a script that simply clobbers the contents # of any file it's handed with the number "3" $ENV{'EDITOR'} = 'reset_memcache_editor.sh'; # - we now call sudoedit on the system file. How sudoedit works is that it # starts by making a temp copy of the system file with appropriate # permissions allowing the user to edit. It then passes the path to the # temp file to the default editor - typically this would be an interactive # editor like 'vi'. However, we've just replaced the editor with a custom # script that just writes '3' as the content of the tmp file. Finally, when # the editor exits, sudoedit copies the tmp file over the top of the system # file, restoring appropriate root-level permissions `sudoedit /proc/sys/vm/drop_caches`; # - restore the default editor, just in case something in Greenstone # depends on this being a reasonably value $ENV{'EDITOR'} = $current_editor; print "Done\n"; # - short break to give OS time to actually notice the drop_caches command &printFlush(" * Waiting for drop_caches to complete... "); while ($flush_delay > 0) { &printFlush($flush_delay . ' '); sleep(1); $flush_delay--; } print "Done\n"; } ## flushDiskCache() ## @function parseStrace # sub parseStrace { my ($command, $logs_path, $debug_path) = @_; my $start_timestamp = 0; my $end_timestamp = 0; my $io_duration = 0; my $misc_duration = 0; my $import_complete = 0; my $documents_found = 0; my $documents_processed = 0; # hash from PIDs to array (FIFO stack) of syscalls and arguments my $interrupted_stacks = {}; # open the logs if we have been given somewhere to log too my $logging_enabled = 0; if ($logs_path ne '') { my $strace_path = $logs_path . '/strace.out'; open(STRACEOUT, '>:utf8', $strace_path) or die("Error! Failed to open file for writing: " . $strace_path); my $gslog_path = $logs_path . '/import.log'; open(GSLOGOUT, '>:utf8', $gslog_path) or die("Error! Failed to open file for writing: " . $gslog_path); #$logging_enabled = 1; } # debug mode means writing a whole bunch of information to a CSV if ($debug_path) { open(TSVOUT, '>:utf8', $debug_path) or die('Error! Failed to open file for writing: ' . $logs_path . '/' . $debug); print TSVOUT "TIMESTAMP\tPID\tSYSCALL\tELAPSED\tTOTALIO\tTOTALMISC\tARGS\tRESULT\n"; } if (open(GSI, $command . '|')) { my $pid = 0; my $line = ; while ($line) { my $next_line = ; if ($logging_enabled) { print STRACEOUT $line; } if ($line =~ /^[^\d\[]+(.*)/) { $line = $1; } # we may have a line that unfortunately includes newlines in its arguments list # - watch out for unfinished syscalls that will be resumed later # - oh! and signals, cause they are way different anyhow while ($line !~ // && $line !~ /SIG/ && $line =~ /\(/ && $line !~ /\)\s+=/) { $line .= ' ' . $next_line; if ($next_line = ) { } else { print "Error! Failed to find complete arguments list: " . $line . "\n"; exit; } } # detect and remove any PID information to make the rest of parsing easier if ($line =~ /^\[pid\s+(\d+)\]\s+(.*)$/s) { my $new_pid = $1; $line = $2; if ($new_pid != $pid) { $pid = $new_pid; } } # detect resume lines, and rewrite so they look like other lines if ($line =~ /^(\d+\.\d+)\s+<\.\.\.\s+([a-z0-9_]+)\s+resumed>\s+(.*)\)\s+=\s+(.+)\s+<(\d+\.\d+)>$/s) { my $timestamp = $1; my $prev_syscall = ''; my $syscall = $2; my $args_prefix = ''; my $args_suffix = $3; my $result = $4; my $duration = $5; if ($syscall eq 'write' && $args_prefix =~ /^[12],/) { chomp($line); print "\n\nResuming write pid:$pid line:|" . $line . "|\n"; # pop the last interrupted syscall off the stack for this pid if (!defined $interrupted_stacks->{$pid} || scalar(@{$interrupted_stacks->{$pid}}) == 0) { print "Error! Resume when stack is empty: " . $pid . "\n"; exit; } ($prev_syscall, $args_prefix) = @{pop(@{$interrupted_stacks->{$pid}})}; if ($syscall ne $prev_syscall) { print "Error! Resume syscall does not match unfinished syscall: " . $syscall . " != " . $prev_syscall . "\n"; exit; } } # Time for a hack - duration is highly dubious for suspended # syscalls. It typically includes the total walltime spent doing # anything between when the 'unfinished' and the 'resumed'... time that # is already accounted for in other parsed syscalls. What I'll do is # limit the resumed calls duration to, at most, the time difference # between this syscall and the next. This is highly inaccurate, of # course, as it excludes time spent before the suspension but then # includes time spent in userspace but is significantly better than a # elasped duration several times longer than the syscall actually took. if ($next_line =~ /^.*?(\d+\.\d+)/) { my $next_timestamp = $1; my $timestamp_duration = $next_timestamp - $timestamp; if ($duration > $timestamp_duration) { #rint "[resume: elapsed longer than duration... fudging] "; $duration = $timestamp_duration; } } $line = $timestamp . ' ' . $syscall . '(' . $args_prefix . ' ' . $args_suffix . ') = ' . $result . ' <' . sprintf('%0.6f', $duration) . '>'; } # SPECIAL CASES # - unfinished syscalls... the only ones we care about are writes to standard out or error as these are echoed to the shell. Add them to the appropriate stack waiting for the 'resume' rewrite above if ($line =~ /^(\d+\.\d+)\s+([a-z0-9_]+)\((.*?)\s+$/) { my $timestamp = $1; my $syscall = $2; my $args_prefix = $3; if ($syscall eq 'write' && $args_prefix =~ /^[12],/) { #print "\n\nPending write pid:$pid line:|" . $line . "|\n"; if (!defined $interrupted_stacks->{$pid}) { $interrupted_stacks->{$pid} = []; } push(@{$interrupted_stacks->{$pid}}, [$syscall, $args_prefix]); } } # - processes announce detachments and resumptions - maybe this explains # why wait4's keep chaning (-q should shut these up) elsif ($line =~ /^Process (\d+) (detached|resumed)/) { my $local_pid = $1; my $action = $2; } # - exit_group never has a duration elsif ($line =~ /^(\d+\.\d+)\s+(_exit|exit_group)\((\d+)\)\s+=\s+\?$/) { my $timestamp = $1; my $exit_function = $2; my $exit_value = $3; $end_timestamp = $timestamp; if ($debug) { print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t"; print TSVOUT $pid . "\t"; print TSVOUT $exit_function . "\t"; print TSVOUT "0.000000\t"; print TSVOUT sprintf("%0.6f", $misc_duration) . "\t"; print TSVOUT sprintf("%0.6f", $io_duration) . "\t"; print TSVOUT $exit_value . "\t"; print TSVOUT "?\n"; } } # - signals elsif ($line =~ /^(\d+\.\d+)\s+---\s+([A-Z]+)\s+\((.*)\)\s+\@\s+0\s+\(0\)\s+---$/) { my $timestamp = $1; my $signal = $2; my $message = $3; $end_timestamp = $timestamp; } # NORMAL CASE elsif ($line =~ /^(\d+\.\d+)\s+([a-z0-9_]+)\((.*)\)\s+=\s+(.+)\s+<(\d+\.\d+)>$/s) { my $timestamp = $1; my $syscall = $2; my $args = $3; my $result = $4; my $duration = $5; if ($start_timestamp == 0) { $start_timestamp = $timestamp; } $end_timestamp = $timestamp; if (defined $io_function_list->{$syscall}) { $io_duration += $duration; } else { $misc_duration += $duration; ##print "[$syscall]"; } if ($syscall eq 'write' && $args =~ /[12],\s+"(.*)"/s) { my $payload = $1; $payload =~ s/\\n/\n/g; # parse up some collection import statistics if ($payload =~ /Import complete/) { $import_complete = 1; } elsif ($payload =~ /^\* (\d+) documents were considered/) { $documents_found = $1; } elsif ($payload =~ /^\* (\d+) were processed/) { $documents_processed = $1; } # finally, write these lines out to log if a path has been provided if ($logging_enabled) { print GSLOGOUT $payload; } } if ($debug) { $args =~ s/\r?\n//g; print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t"; print TSVOUT $pid . "\t"; print TSVOUT $syscall . "\t"; print TSVOUT sprintf("%0.6f", $duration) . "\t"; print TSVOUT sprintf("%0.6f", $misc_duration) . "\t"; print TSVOUT sprintf("%0.6f", $io_duration) . "\t"; print TSVOUT $args . "\t"; print TSVOUT $result . "\n"; } } else { chomp($line); print "\n\nUnparsed: |" . $line . "|\n\n"; } # Move on to the next line... (may be undef) $line = $next_line; } } else { print "Error! Failed to open import process for reading: " . $command . "\n"; } if ($logging_enabled) { close(STRACEOUT); close(GSLOGOUT); } if ($debug_path) { close(TSVOUT); } my $cmd_duration = $end_timestamp - $start_timestamp; return ($cmd_duration, $io_duration, $misc_duration, $import_complete, $documents_found, $documents_processed); } # parseStrace() sub printFlush { my ($msg) = @_; $| = 1; print $msg; $| = 0; } sub printUsage { my ($msg) = @_; if (defined $msg) { print 'Error! ' . $msg . "\n"; } print "Usage: import_with_io_metric.pl -collection [-debug]\n\n"; exit; } ## @function # sub execSQL { my ($db_path, $sql) = @_; # call getValueSQL but don't care about result getValueSQL($db_path, $sql); } # /** execSQL() **/ ## @function # sub getRecordSQL { my ($db_path, $sql) = @_; if ($sql !~ /LIMIT 1/i) { $sql .= ' LIMIT 1'; } my $value = getValueSQL($db_path, $sql); return split(/\|/,$value); } # /** getRecordSQL() **/ ## @function # sub getRecordsSQL { my ($db_path, $sql) = @_; my @records; my $raw_values = getValueSQL($db_path, $sql); foreach my $raw_record (split(/\r?\n/, $raw_values)) { my @record = split(/\|/, $raw_record); push(@records, \@record); } return @records; } ## getRecordsSQL() ## @function # sub getValueSQL { my ($db_path, $sql) = @_; my $result = `sqlite3 "$db_path" "$sql" 2>&1`; if ($result =~ /Error:/) { die("Fatal Error!\nSQL:" . $sql . "\nMsg:" . $result); } # trim $result =~ s/^\s*|\s*$//g; return $result; } # /** getValueSQL() **/ ## @function printReport() # sub printReport { my ($db_path) = @_; # get listing of unique collection names sorted alphabetically. While we are # at it, grab the number of documents processed from pretty much any entry # for this collection (they should all be the same, otherwise they would've # been repeated back in testing) my $collection_sql = 'SELECT collection, AVG(docs_processed) FROM tests GROUP BY collection'; my @collection_records = &getRecordsSQL($db_path, $collection_sql); foreach my $collection_record (@collection_records) { my ($collection, $docs_processed) = @{$collection_record}; # determine the size in bytes of the import directory my $collection_import_directory = $ENV{'GSDLHOME'} . '/collect/' . $collection . '/import'; my $du_command = 'du -bsL "' . $collection_import_directory . '"'; my $du_result = `$du_command`; my $size_in_bytes = 0; if ($du_result =~ /^(\d+)\s+/) { $size_in_bytes = $1; } # pretty print header block for dokuwiki &printReportHeader($collection, $docs_processed, $size_in_bytes); my @test_records = &getRecordsSQL($db_path, 'SELECT * FROM tests WHERE collection=\'' . $collection . '\' AND test_started > 0 ORDER BY test_started'); my $number_of_tests = scalar(@test_records); my $total_b_elapsed = 0; my @b_elapsed_values; my $total_b_io_percent = 0; my @b_io_percent_values; my $total_b_other_percent = 0; my @b_other_percent_values; my $total_d_elapsed = 0; my @d_elapsed_values; my $total_d_io_percent = 0; my @d_io_percent_values; my $total_d_other_percent = 0; my @d_other_percent_values; foreach my $test_record (@test_records) { my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record}; my $b_io_percent = ($b_io / $b_elapsed) * 100; my $b_other_percent = ($b_other / $b_elapsed) * 100; my $d_elapsed = $i_elapsed - $b_elapsed; my $d_io = $i_io - $b_io; my $d_io_percent = ($d_io / $d_elapsed) * 100; my $d_other = $i_other - $b_other; my $d_other_percent = ($d_other / $d_elapsed) * 100; # add to running totals $total_b_elapsed += $b_elapsed; push(@b_elapsed_values, $b_elapsed); $total_b_io_percent += $b_io_percent; push(@b_io_percent_values, $b_io_percent); $total_b_other_percent += $b_other_percent; push(@b_other_percent_values, $b_other_percent); $total_d_elapsed += $d_elapsed; push(@d_elapsed_values, $d_elapsed); $total_d_io_percent += $d_io_percent; push(@d_io_percent_values, $d_io_percent); $total_d_other_percent += $d_other_percent; push(@d_other_percent_values, $d_other_percent); } # Averages my $b_elapsed_avg = $total_b_elapsed / $number_of_tests; my $b_io_percent_avg = $total_b_io_percent / $number_of_tests; my $b_other_percent_avg = $total_b_other_percent / $number_of_tests; my $d_elapsed_avg = $total_d_elapsed / $number_of_tests; my $d_io_percent_avg = $total_d_io_percent / $number_of_tests; my $d_other_percent_avg = $total_d_other_percent / $number_of_tests; # Standard Deviations my $b_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $b_elapsed_avg, @b_elapsed_values); my $b_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_io_percent_avg, @b_io_percent_values); my $b_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_other_percent_avg, @b_other_percent_values); my $d_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $d_elapsed_avg, @d_elapsed_values); my $d_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_io_percent_avg, @d_io_percent_values); my $d_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_other_percent_avg, @d_other_percent_values); my $counter = 0; my $outlier_count = 0; foreach my $test_record (@test_records) { my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record}; $counter++; my $b_io_percent = ($b_io / $b_elapsed) * 100; my $b_other_percent = ($b_other / $b_elapsed) * 100; my $d_elapsed = $i_elapsed - $b_elapsed; my $d_io = $i_io - $b_io; my $d_io_percent = ($d_io / $d_elapsed) * 100; my $d_other = $i_other - $b_other; my $d_other_percent = ($d_other / $d_elapsed) * 100; print '| ' . $counter . ' | ' . $the_collection . ' | ' . $docs_processed . ' | ' . $size_in_bytes . ' | ' . $test_started . ' | ' . sprintf('%0.6f', $b_elapsed) . ' | ' . sprintf('%0.1f', $b_io_percent) . '% | ' . sprintf('%0.1f', $b_other_percent) . '% | ' . sprintf('%0.6f', $d_elapsed) . ' | ' . sprintf('%0.1f', $d_io_percent) . '% | ' . sprintf('%0.1f', $d_other_percent) . '% | '; if (&isOutlier($d_io_percent, $d_io_percent_avg, $d_io_percent_stddev)) { print 'Yes'; $outlier_count++; } else { print 'No'; } print " |\n"; } print '^ Average ||||| ' . sprintf('%0.6f', $b_elapsed_avg) . ' | ' . sprintf('%0.1f', $b_io_percent_avg) . '% | ' . sprintf('%0.1f', $b_other_percent_avg) . '% | ' . sprintf('%0.6f', $d_elapsed_avg) . ' | ' . sprintf('%0.1f', $d_io_percent_avg) . '% | ' . sprintf('%0.1f', $d_other_percent_avg) . "% |\n"; print '^ Standard Deviation ||||| ' . sprintf('%0.6f', $b_elapsed_stddev) . ' | ' . sprintf('%0.1f', $b_io_percent_stddev) . '% | ' . sprintf('%0.1f', $b_other_percent_stddev) . '% | ' . sprintf('%0.6f', $d_elapsed_stddev) . ' | ' . sprintf('%0.1f', $d_io_percent_stddev) . '% | ' . sprintf('%0.1f', $d_other_percent_stddev) . '% | ' . $outlier_count . " |\n"; print "\n"; } } ## printReport() ## @function printReportHeader($collection) # sub printReportHeader { my ($collection) = @_; print '==== "' . $collection . "\" Collection ====\n\n"; print "^ Count ^ Collection ^ NumDocs ^ Size ^ Timestamp ^ Base ^^^ Import ^^^ Outlier? ^\n"; print "^ ::: ^ ::: ^ ::: ^ ::: ^ ::: ^ Elapsed ^ Syscalls ^^ Elapsed ^ Syscalls ^^ ::: ^\n"; print "^ ::: ^ ::: ^ ::: ^ ::: ^ ::: ^ ::: ^ I/O% ^ Other% ^ ::: ^ I/O% ^ Other ^ ::: ^\n"; } ## printReportHeader() ## @function calculateStandardDeviation($mean, $value1 ... $valueN) # Our sample size is the population so we can use the simplier standard deviation equation for an entire population (now, theres some rap lyrics) # stddev = squareroot ( sum ( square( value - mean ) ) / count ) sub calculateStandardDeviation { my $population = shift(@_); my $mean = shift(@_); my $count = scalar(@_); my $total = 0; # sum of squares foreach my $value (@_) { my $subtotal = $value - $mean; $total += $subtotal * $subtotal; } if ($population == $count) { $total = $total / $count; } else { $total = $total / ($count - 1); } my $standard_deviation = sqrt($total); return $standard_deviation; } ## calculateStandardDeviation() ## @function isOutlier($value, $mean, $stddev) # sub isOutlier { my ($value, $mean, $stddev) = @_; my $min = $mean - (2 * $stddev); my $max = $mean + (2 * $stddev); return ($value <= $min || $max <= $value); } ## isOutlier() 1;