root/gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl @ 28665

Revision 28665, 16.9 KB (checked in by jmt12, 6 years ago)

Latest changes to workaround resumed syscalls massive duration problem

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl
2
3# Pragma
4use strict;
5use warnings;
6# Libraries
7use File::Path qw( remove_tree );
8use File::Temp qw( tempdir );
9use Getopt::Long;
10use Time::HiRes qw( gettimeofday tv_interval );
11
12BEGIN
13{
14  print "======================= Greenstone Import + I/O Metrics ======================\n\n";
15  if (!defined $ENV{'GSDLHOME'})
16  {
17    print "Error! GSDLHOME not set\n\n";
18    exit;
19  }
20}
21
22# 1. Parse and sanitize arguments - the listing of syscalls whose duration we
23# accumulate as IO time. Unless otherwise noted, these are taken from what
24# strace_analyzer.pl would use
25print " * Initializing... Done\n";
26our $strace_flags = '-f -q -s 256 -T -ttt';
27our $io_function_list = {
28                         'access'=>1,
29                         'chmod'=>1,
30                         'close'=>1,
31                         'creat'=>1,
32                         'fclose'=>1,
33                         'fcntl'=>1,
34                         'fgetpos'=>1,
35                         'flock'=>1,
36                         'fseek'=>1,
37                         'fsetpos'=>1,
38                         'fstat'=>1,
39                         'fsync'=>1,
40                         'ftell'=>1,
41                         'getdents'=>1,
42                         #'ioctl'=>1, # John added
43                         'llseek'=>1,
44                         'lockf'=>1,
45                         'lseek'=>1,
46                         'lseek64'=>1,
47                         'mkdir'=>1,
48                         'open'=>1,
49                         'read'=>1,
50                         'readdir'=>1,
51                         'rename'=>1,
52                         'rewind'=>1,
53                         'rewinddir'=>1,
54                         'scandir'=>1,
55                         'stat'=>1,
56                         'stat64'=>1,
57                         'telldir'=>1,
58                         'unlink'=>1,
59                         'write'=>1
60                        };
61my $start_time = [gettimeofday()];
62my $collection = '';
63my $debug = 0;
64my $flush_delay = 3;
65
66GetOptions ('collection=s' => \$collection,
67            'debug' => \$debug,
68            'flushdelay=i' => \$flush_delay)
69or &printUsage('Invalid or missing argument');
70
71if (!defined $collection || $collection eq '')
72{
73  &printUsage('Missing collection name');
74}
75my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection;
76if (!-d $collection_path)
77{
78  &printUsage('Collection not found: ' . $collection_path);
79}
80print '   - Greenstone:  ' . $ENV{'GSDLHOME'} . "\n";
81print '   - Collection:  ' . $collection . "\n";
82print '   - Flush Delay: ' . $flush_delay . "\n";
83print '   - Debug?       ' . ($debug ? 'Yes' : 'No') . "\n";
84print "\n";
85
86# 2. Run dummy import command (empty import directory) within trace to
87# determine baseline
88&clearArchives();
89&flushDiskCache($flush_delay);
90print " * Running baseline collection import... ";
91my $dummy_dir = tempdir( CLEANUP => 1);
92my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1';
93my $debug_path1 = '';
94if ($debug)
95{
96  $debug_path1 = $collection_path . '/debug-baseline.tsv';
97}
98my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1);
99print "Done\n";
100print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n";
101print '   - SysCall Time:  ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n";
102print '   - SysCall Prct:  ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n";
103print '   - I/O Time:      ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
104print '   - I/O Percent:   ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n";
105if ($debug)
106{
107  print "   - See 'debug-baseline.tsv' for raw data\n";
108}
109print "\n";
110
111# 3. Run normal import command within strace reading output line by line
112&clearArchives();
113&flushDiskCache($flush_delay);
114print " * Running import and tracking I/O metrics... ";
115my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1';
116my $debug_path2 = '';
117if ($debug)
118{
119  $debug_path2 = $collection_path . '/debug-import.tsv';
120}
121my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
122print "Done\n";
123print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n";
124print '   - Found:         ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
125print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
126print '   - Duration:      ' . sprintf('%0.6f', $duration_import) . " seconds\n";
127print '   - SysCall Time:  ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n";
128print '   - SysCall Prct:  ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n";
129print '   - I/O Duration:  ' . sprintf('%0.6f', $duration_io) . " seconds\n";
130print '   - I/O Percent:   ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n";
131print "   - See 'import.log' for Greenstone Import details\n";
132print "   - See 'strace.out' for STrace details\n";
133if ($debug)
134{
135  print "   - See 'debug-import.tsv' for raw data\n";
136}
137print "\n";
138
139# 4. Results
140print &makeHeader('Import Results', 78) . "\n\n";
141print " Import Duration:  " . sprintf('%0.6f', ($duration_import - $baseline_duration)) . " seconds\n";
142print " I/O Duration:     " . sprintf('%0.6f', ($duration_io - $baseline_io)) . " seconds\n";
143print " I/O Percentage:   " . sprintf('%d', ((($duration_io - $baseline_io)/($duration_import - $baseline_duration)) * 100)) . "%\n";
144print "\n";
145
146# Complete!
147my $end_time = [gettimeofday()];
148my $duration = tv_interval($start_time, $end_time);
149print &makeHeader(sprintf('Complete in %0.2f seconds', $duration), 78) . "\n\n";
150
151exit;
152
153
154## @function
155#
156sub makeHeader
157{
158  my ($msg, $length) = @_;
159  if (length($msg) % 2 == 1)
160  {
161    $msg = ' ' . $msg . '  ';
162  }
163  else
164  {
165    $msg = ' ' . $msg . ' ';
166  }
167  while (length($msg) < $length)
168  {
169    $msg = '=' . $msg . '=';
170  }
171  return $msg;
172}
173## makeHeader() ##
174
175
176## @function
177#
178sub clearArchives
179{
180  # 2. Remove any existing archives directory, so it doesn't get factored in the
181  # IO costs
182  my $archives_dir = $collection_path . '/archives';
183  if (-d $archives_dir)
184  {
185    print " * Deleting existing archives directory... ";
186    remove_tree($archives_dir);
187    print "Done\n";
188  }
189}
190## clearArchives()
191
192
193## @function
194#
195sub flushDiskCache
196{
197  my ($flush_delay) = @_;
198  # The Premise: use sudoedit and other black magic to clear out the memory-
199  # based disk cache (which is done by writing the number 3 to a certain
200  # system file)
201  &printFlush(" * Synching file system... ");
202  `sync`;
203  print "Done\n";
204  &printFlush(" * Dropping memory disk cache... ");
205  # - save our current default editor
206  my $current_editor = $ENV{'EDITOR'};
207  # - replace default editor with a script that simply clobbers the contents
208  #   of any file it's handed with the number "3"
209  $ENV{'EDITOR'} = 'reset_memcache_editor.sh';
210  # - we now call sudoedit on the system file. How sudoedit works is that it
211  #   starts by making a temp copy of the system file with appropriate
212  #   permissions allowing the user to edit. It then passes the path to the
213  #   temp file to the default editor - typically this would be an interactive
214  #   editor like 'vi'. However, we've just replaced the editor with a custom
215  #   script that just writes '3' as the content of the tmp file. Finally, when
216  #   the editor exits, sudoedit copies the tmp file over the top of the system
217  #   file, restoring appropriate root-level permissions
218  `sudoedit /proc/sys/vm/drop_caches`;
219  # - restore the default editor, just in case something in Greenstone
220  #   depends on this being a reasonably value
221  $ENV{'EDITOR'} = $current_editor;
222  print "Done\n";
223  # - short break to give OS time to actually notice the drop_caches command
224  &printFlush(" * Waiting for drop_caches to complete... ");
225  while ($flush_delay > 0)
226  {
227    &printFlush($flush_delay . ' ');
228    sleep(1);
229    $flush_delay--;
230  }
231  print "Done\n\n";
232}
233## flushDiskCache()
234
235
236## @function parseStrace
237#
238sub parseStrace
239{
240  my ($command, $logs_path, $debug_path) = @_;
241  my $start_timestamp = 0;
242  my $end_timestamp = 0;
243  my $io_duration = 0;
244  my $misc_duration = 0;
245  my $import_complete = 0;
246  my $documents_found = 0;
247  my $documents_processed = 0;
248  # hash from PIDs to array (FIFO stack) of syscalls and arguments
249  my $interrupted_stacks = {};
250
251  # open the logs if we have been given somewhere to log too
252  my $logging_enabled = 0;
253  if ($logs_path ne '')
254  {
255    my $strace_path = $logs_path . '/strace.out';
256    open(STRACEOUT, '>:utf8', $strace_path) or die("Error! Failed to open file for writing: " . $strace_path);
257    my $gslog_path = $logs_path . '/import.log';
258    open(GSLOGOUT, '>:utf8', $gslog_path) or die("Error! Failed to open file for writing: " . $gslog_path);
259    $logging_enabled = 1;
260  }
261
262  # debug mode means writing a whole bunch of information to a CSV
263  if ($debug_path)
264  {
265    open(TSVOUT, '>:utf8', $debug_path) or die('Error! Failed to open file for writing: '  . $logs_path . '/' . $debug);
266    print TSVOUT "TIMESTAMP\tPID\tSYSCALL\tELAPSED\tTOTALIO\tTOTALMISC\tARGS\tRESULT\n";
267  }
268
269  if (open(GSI, $command . '|'))
270  {
271    my $pid = 0;
272    my $line = <GSI>;
273    while ($line)
274    {
275      my $next_line = <GSI>;
276      if ($logging_enabled)
277      {
278        print STRACEOUT $line;
279      }
280      # we may have a line that unfortunately includes newlines in its arguments list
281      # - watch out for unfinished syscalls that will be resumed later
282      # - oh! and signals, cause they are way different anyhow
283      while ($line !~ /<unfinished \.\.\.>/ && $line !~ /SIG/ && $line =~ /\(/ && $line !~ /\)\s+=/)
284      {
285        $line .= ' ' . $next_line;
286        if ($next_line = <GSI>)
287        {
288        }
289        else
290        {
291          print "Error! Failed to find complete arguments list: " . $line . "\n";
292          exit;
293        }
294      }
295      # detect and remove any PID information to make the rest of parsing easier
296      if ($line =~ /^\[pid\s+(\d+)\]\s+(.*)$/s)
297      {
298        my $new_pid = $1;
299        $line = $2;
300        if ($new_pid != $pid)
301        {
302          $pid = $new_pid;
303        }
304      }
305      # detect resume lines, and rewrite so they look like other lines
306      if ($line =~ /^(\d+\.\d+)\s+<\.\.\.\s+([a-z0-9_]+)\s+resumed>\s+(.*)\)\s+=\s+(.+)\s+<(\d+\.\d+)>$/s)
307      {
308        my $timestamp = $1;
309        my $prev_syscall = '';
310        my $syscall = $2;
311        my $args_prefix = '';
312        my $args_suffix = $3;
313        my $result = $4;
314        my $duration = $5;
315        if ($syscall eq 'write' && $args_prefix =~ /^[12],/)
316        {
317          chomp($line);
318          print "\n\nResuming write pid:$pid line:|" . $line . "|\n";
319          # pop the last interrupted syscall off the stack for this pid
320          if (!defined $interrupted_stacks->{$pid} || scalar(@{$interrupted_stacks->{$pid}}) == 0)
321          {
322            print "Error! Resume when stack is empty: " . $pid . "\n";
323            exit;
324          }
325          ($prev_syscall, $args_prefix) = @{pop(@{$interrupted_stacks->{$pid}})};
326          if ($syscall ne $prev_syscall)
327          {
328            print "Error! Resume syscall does not match unfinished syscall: " . $syscall . " != " . $prev_syscall . "\n";
329            exit;
330          }
331        }
332
333        # Time for a hack - duration is highly dubious for suspended
334        # syscalls. It typically includes the total walltime spent doing
335        # anything between when the 'unfinished' and the 'resumed'... time that
336        # is already accounted for in other parsed syscalls.  What I'll do is
337        # limit the resumed calls duration to, at most, the time difference
338        # between this syscall and the next. This is highly inaccurate, of
339        # course, as it includes time spent in userspace but is significantly
340        # better than a elasped duration several times longer than the syscall
341        # actually took.
342        if ($next_line =~ /^.*?(\d+\.\d+)/)
343        {
344          my $next_timestamp = $1;
345          my $timestamp_duration = $next_timestamp - $timestamp;
346          if ($duration > $timestamp_duration)
347          {
348            #rint "[resume: elapsed longer than duration... fudging] ";
349            $duration = $timestamp_duration;
350          }
351        }
352
353        $line = $timestamp . ' ' . $syscall . '(' . $args_prefix . ' ' . $args_suffix . ') = ' . $result . ' <' . sprintf('%0.6f', $duration) . '>';
354      }
355
356      # SPECIAL CASES
357      # - unfinished syscalls... the only ones we care about are writes to standard out or error as these are echoed to the shell.  Add them to the appropriate stack waiting for the 'resume' rewrite above
358      if ($line =~ /^(\d+\.\d+)\s+([a-z0-9_]+)\((.*?)\s+<unfinished \.\.\.>$/)
359      {
360        my $timestamp = $1;
361        my $syscall = $2;
362        my $args_prefix = $3;
363        if ($syscall eq 'write' && $args_prefix =~ /^[12],/)
364        {
365          print "\n\nPending write pid:$pid line:|" . $line . "|\n";
366          if (!defined $interrupted_stacks->{$pid})
367          {
368            $interrupted_stacks->{$pid} = [];
369          }
370          push(@{$interrupted_stacks->{$pid}}, [$syscall, $args_prefix]);
371        }
372      }
373      # - processes announce detachments and resumptions - maybe this explains
374      # why wait4's keep chaning (-q should shut these up)
375      elsif ($line =~ /^Process (\d+) (detached|resumed)/)
376      {
377        my $local_pid = $1;
378        my $action = $2;
379      }
380      # - exit_group never has a duration
381      elsif ($line =~ /^(\d+\.\d+)\s+exit_group\((\d+)\)\s+=\s+\?$/)
382      {
383        my $timestamp = $1;
384        my $exit_value = $2;
385        $end_timestamp = $timestamp;
386        if ($debug)
387        {
388          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t";
389          print TSVOUT $pid . "\t";
390          print TSVOUT "exit_group\t";
391          print TSVOUT "0.000000\t";
392          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t";
393          print TSVOUT sprintf("%0.6f", $io_duration) . "\t";
394          print TSVOUT $exit_value . "\t";
395          print TSVOUT "?\n";
396        }
397      }
398      # - signals
399      elsif ($line =~ /^(\d+\.\d+)\s+---\s+([A-Z]+)\s+\((.*)\)\s+\@\s+0\s+\(0\)\s+---$/)
400      {
401        my $timestamp = $1;
402        my $signal = $2;
403        my $message = $3;
404        $end_timestamp = $timestamp;
405      }
406      # NORMAL CASE
407      elsif ($line =~ /^(\d+\.\d+)\s+([a-z0-9_]+)\((.*)\)\s+=\s+(.+)\s+<(\d+\.\d+)>$/s)
408      {
409        my $timestamp = $1;
410        my $syscall = $2;
411        my $args = $3;
412        my $result = $4;
413        my $duration = $5;
414        if ($start_timestamp == 0)
415        {
416          $start_timestamp = $timestamp;
417        }
418        $end_timestamp = $timestamp;
419        if (defined $io_function_list->{$syscall})
420        {
421          $io_duration += $duration;
422        }
423        else
424        {
425          $misc_duration += $duration;
426          ##print "[$syscall]";
427        }
428        if ($syscall eq 'write' && $args =~ /[12],\s+"(.*)"/s)
429        {
430          my $payload = $1;
431          $payload =~ s/\\n/\n/g;
432          # parse up some collection import statistics
433          if ($payload =~ /Import complete/)
434          {
435            $import_complete = 1;
436          }
437          elsif ($payload =~ /^\* (\d+) documents were considered/)
438          {
439            $documents_found = $1;
440          }
441          elsif ($payload =~ /^\* (\d+) were processed/)
442          {
443            $documents_processed = $1;
444          }
445          # finally, write these lines out to log if a path has been provided
446          if ($logging_enabled)
447          {
448            print GSLOGOUT $payload;
449          }
450        }
451        if ($debug)
452        {
453          $args =~ s/\r?\n//g;
454          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t";
455          print TSVOUT $pid . "\t";
456          print TSVOUT $syscall . "\t";
457          print TSVOUT sprintf("%0.6f", $duration) . "\t";
458          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t";
459          print TSVOUT sprintf("%0.6f", $io_duration) . "\t";
460          print TSVOUT $args . "\t";
461          print TSVOUT $result . "\n";
462        }
463      }
464      else
465      {
466        chomp($line);
467        print "\n\nUnparsed: |" . $line . "|\n\n";
468      }
469      # Move on to the next line... (may be undef)
470      $line = $next_line;
471    }
472  }
473  else
474  {
475    print "Error! Failed to open import process for reading: " . $command . "\n";
476  }
477  if ($logging_enabled)
478  {
479    close(STRACEOUT);
480    close(GSLOGOUT);
481  }
482  if ($debug_path)
483  {
484    close(TSVOUT);
485  }
486  my $cmd_duration = $end_timestamp - $start_timestamp;
487  return ($cmd_duration, $io_duration, $misc_duration, $import_complete, $documents_found, $documents_processed);
488}
489# parseStrace()
490
491
492sub printFlush
493{
494  my ($msg) = @_;
495  $| = 1;
496  print $msg;
497  $| = 0;
498}
499
500
501sub printUsage
502{
503  my ($msg) = @_;
504  if (defined $msg)
505  {
506    print 'Error! ' . $msg . "\n";
507  }
508  print "Usage: import_with_io_metric.pl -collection <collection name> [-debug]\n\n";
509  exit;
510}
511
5121;
Note: See TracBrowser for help on using the browser.