Changeset 27551


Ignore:
Timestamp:
2013-06-05T13:07:43+12:00 (11 years ago)
Author:
jmt12
Message:

Altered so that it expects to be given a CSV containing parallel processing information (which can come from OpenMPI or Hadoop Greenstone imports). It also now determines the import directory by finding the longest common directory path among the files processed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/generate_gantt.pl

    r27543 r27551  
    99print "\n===== Generate Timing (GANTT) =====\n";
    1010
     11# 0. Configuration
     12my $debug = 0;
     13my $import_dir;
     14
    1115# 1. Initialization
    12 #my $chart_width = 1024;
    13 my $chart_width = 1536;
    14 #my $chart_width = 2048;
     16if (!defined $ARGV[0] || !-d $ARGV[0])
     17{
     18  &printUsage('Directory not provided or doesn\'t exist');
     19}
     20my $dir = $ARGV[0];
     21my $timing_csv_path = &filenameCat($dir, 'timing.csv');
     22if (!-e $timing_csv_path)
     23{
     24  &printUsage('Directory doesn\'t contain timing.csv: ' . $dir);
     25}
     26print 'Timing File: ' . $timing_csv_path . "\n";
     27my $chart_width = 1024;
     28if (defined $ARGV[1])
     29{
     30  if ($ARGV[1] !~ /^\d+$/)
     31  {
     32    &printUsage('Chart width not a number');
     33  }
     34  $chart_width = $ARGV[1];
     35}
    1536print "Chart Width: " . $chart_width . "px\n";
    16 # 1.1 Store all information extracted in a cool data structure
    17 # - N = hostname, S = thread start, E = thread end
    18 my $timing_data = {'M' => {'N'=>'', 'S'=>0, 'E'=>0}};
    19 # 1.2 Check the file exists
    20 if (!-f $ARGV[0])
    21 {
    22   die("Error! File can't be read: " . $ARGV[0]);
    23 }
    24 my $main_log_filename = $ARGV[0];
    25 # 1.2 From the filename we can parse in some information like the number of worker threads
    26 my $number_of_workers = 0;
    27 if ($main_log_filename =~ /\-W(\d+)E/)
    28 {
    29   $number_of_workers = $1;
     37print "===================================\n\n";
     38
     39# Read in timing.csv and parse information into data structure
     40my $timing_data = {};
     41my $id_2_worker_id = {};
     42if (open(TIN, '<:utf8', $timing_csv_path))
     43{
     44  my $line;
     45  while ($line = <TIN>)
     46  {
     47    my @parts = split(/,/, $line);
     48    if ($parts[1] eq 'M0')
     49    {
     50      $timing_data->{'M'} = {'N'=>$parts[2], 'S'=>$parts[3], 'E'=>$parts[4]};
     51    }
     52    elsif ($parts[1] =~ /W\d+/)
     53    {
     54      $timing_data->{$parts[1]} = {'N'=>$parts[2], 'S'=>$parts[3], 'E'=>$parts[4], 'F'=>{}};
     55      $id_2_worker_id->{$parts[0]} = $parts[1];
     56    }
     57    elsif ($parts[1] =~ /T\d+/)
     58    {
     59      my $worker_id = $id_2_worker_id->{$parts[7]};
     60      my $stop = $parts[4];
     61      my $filepath = $parts[8];
     62      $import_dir = &longestCommonPath($filepath, $import_dir);
     63      $timing_data->{$worker_id}->{'F'}->{$parts[3]} = {'FN'=>$filepath, 'PS'=>($stop - $parts[5]), 'PE'=>$stop, 'E'=>$stop};
     64    }
     65  }
     66  close(TIN);
    3067}
    3168else
    3269{
    33   die("Error! Malformed filename (expecting number of workers): " . $main_log_filename);
    34 }
    35 # 1.3 Initialize the data structure with the number of workers too (we don't
    36 #     know the number of files yet, so they'll have to be adding on the fly)
    37 for (my $i = 1; $i <= $number_of_workers; $i++)
    38 {
    39   $timing_data->{'W' . $i} = {'N'=>'', 'S'=>0, 'E'=>0, 'F'=>{}};
    40 }
    41 
    42 # 2. Read in main log file
    43 print " * Reading main log: " . $main_log_filename . "\n";
    44 open(LOGIN, '<:utf8', $main_log_filename) or die("Error! Failed to open file for reading: " . $main_log_filename);
    45 my $line = '';
    46 my $currently_processing = {};
    47 while ($line = <LOGIN>)
    48 {
    49   # 2.1 Parse in the Master thread start time
    50   if ($line =~ /\[M\d?:(\d+)\] Starting on (.+)/)
    51   {
    52     $timing_data->{'M'}->{'S'} = $1;
    53     $timing_data->{'M'}->{'N'} = $2;
    54   }
    55   elsif ($line =~ /\[(W\d+):(\d+)\] Starting on (.+)/)
    56   {
    57     my $worker_id = $1;
    58     $timing_data->{$worker_id}->{'S'} = $2;
    59     $timing_data->{$worker_id}->{'N'} = $3;
    60   }
    61   elsif ($line =~ /\[(W\d+):(\d+)\] Processing/)
    62   {
    63     my $worker_id = $1;
    64     my $job_start_time = $2;
    65     $timing_data->{$worker_id}->{'F'}->{$job_start_time} = {'FN'=>'', 'PS'=>0, 'PE'=>0, 'E'=>0}; 
    66     $currently_processing->{$worker_id} = $job_start_time;
    67   }
    68   # 2.3 Or we may parse in the starting times for each working thread
    69   # 2.4 Or we may also parse (the last encountered) completion time for each
    70   #     working thread
    71   elsif ($line =~ /\[(W\d+):(\d+)\] Process complete/)
    72   {
    73     my $worker_id = $1;
    74     my $job_end_time = $2;
    75     $timing_data->{$worker_id}->{'E'} = $job_end_time;
    76     my $job_start_time = $currently_processing->{$worker_id};
    77     $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'E'} = $job_end_time;
    78     delete($currently_processing->{$worker_id});
    79   }
    80   # 2.5 Finally, we may parse in the Master thread end time
    81   elsif ($line =~ /\[M\d?:(\d+)\] Master will exit when workers complete/)
    82   {
    83     $timing_data->{'M'}->{'E'} = $1;
    84   }
    85 }
    86 close(LOGIN);
    87 
    88 # 3. Read each of worker logs parsing in information about the files processed
    89 #    - each will be stored (in an associative array) against its start time
    90 print " * Reading worker logs";
    91 foreach my $worker_id (nsort keys %{$timing_data})
    92 {
    93   my $jobs = $timing_data->{$worker_id}->{'F'};
    94   my $counter = 1;
    95   foreach my $job_start_time (sort keys %{$jobs})
    96   {
    97     my $log_filename = 'gsimport-' . $worker_id . '-' . $counter . '.log';
    98     print ".";
    99     open(WLOGIN, '<:utf8', $log_filename) or die("Error! Failed to open for reading: " . $log_filename);
    100     my $wline = '';
    101     while ($wline = <WLOGIN>)
    102     {
    103       if ($wline =~ /\[A:\d+\] SimpleVideoPlugin processing: (.+)/)
    104       {
    105         $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'FN'} = $1;
    106       }
    107       # Start of video processing (excluding as much IO as possible)
    108       elsif ($wline =~ /\[C1:(\d+)\]/)
    109       {
    110         $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'PS'} = $1;
    111       }
    112       # Immediately after processing video
    113       elsif ($wline =~ /\[E2:(\d+)\]/)
    114       {
    115         $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'PE'} = $1;
    116       }
    117     }
    118 
    119     if ($timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'PE'} <= 0)
    120     {
    121       print "\n[Warning - bogus log: $log_filename]";
    122     }
    123 
    124     close(WLOGIN);
    125     $counter++;
    126   }
    127 }
    128 print " Done!\n";
    129 
    130 # 4. Produce CSV of information
    131 print " * Generating timing information as CSV... ";
    132 open(CSVOUT, '>:utf8', 'timing.csv') or die('Error! Failed to open file for writing: timing.csv');
    133 print CSVOUT "number,id,hostname,start,end,hierarchy\n";
    134 my $thread_counter = 1;
    135 foreach my $thread (nsort keys %{$timing_data})
    136 {
    137   my $data = $timing_data->{$thread};
    138   print CSVOUT $thread_counter . ',' . $thread . ',' . $data->{'N'} . ',' . strftime("%H:%M:%S", localtime($data->{'S'})) . ',' . strftime("%H:%M:%S", localtime($data->{'E'})) . ',';
    139   if ($thread eq 'M')
    140   {
    141     print CSVOUT '0';
    142   }
    143   else
    144   {
    145     print CSVOUT '1';
    146   }
    147   print CSVOUT "\n";
    148   $thread_counter++;
    149 }
    150 close(CSVOUT);
    151 print "Done!\n";
    152 
    153 # 5. Produce pretty HTML chart of timing information including jobs
     70  die('Error! Failed to open file for reading: ' . $timing_csv_path);
     71}
     72my $number_of_workers = scalar(keys(%{$id_2_worker_id}));;
     73
     74# 3. Produce pretty HTML chart of timing information including jobs
    15475print " * Generating timing information as HTML... ";
    155 open(HTMLOUT, '>:utf8', 'gantt.html') or die('Error! Failed to open file for writing: gantt.html');
     76open(HTMLOUT, '>:utf8', $dir . '/gantt.html') or die('Error! Failed to open file for writing: gantt.html');
    15677print HTMLOUT "<html>\n";
    15778print HTMLOUT '<head>' . "\n";
     
    15980print HTMLOUT 'div.thread {position:relative}' . "\n";
    16081print HTMLOUT 'div.master {border:1px solid gray;color:white;font-weight:bold}' . "\n";
    161 print HTMLOUT 'div.worker {background-color:green;color:white;font-weight:bold}' . "\n";
     82print HTMLOUT 'div.worker {border:1px solid black;background-color:green;color:white;font-weight:bold}' . "\n";
    16283print HTMLOUT 'div.time {font-size:smaller;font-weight:normal}' . "\n";
    16384print HTMLOUT 'div.job {background-color:transparent;color:black;border:1px solid black;display:block;font-size:smaller;position:relative;text-align:center}' . "\n";
     
    195116        my $process_duration = $process_end - $process_start;
    196117        my $total_duration = $io_duration + $process_duration;
    197         ###rint "[DEBUG] filename: " . $timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'} . "\n";
    198         ###rint "[DEBUG] start: $job_start ps: $process_start pe: $process_end end: $job_end\n";
    199         ###rint "[DEBUG] io: $io_duration process: $process_duration duration: $total_duration\n";
     118        &debugPrint("filename: " . $timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'});
     119        &debugPrint("start: $job_start ps: $process_start pe: $process_end end: $job_end");
     120        &debugPrint("io: $io_duration process: $process_duration duration: $total_duration");
    200121        # Running stats
    201122        $total_io_time += $io_duration;
     
    210131        }
    211132      }
     133      # Shorten filename
     134      $timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'} = substr($timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'}, length($import_dir) + 1);
    212135      $file_count++;
    213136    }
     
    216139my $avg_processing_time = floor(($total_io_time + $total_process_time) / $file_count);
    217140
     141print HTMLOUT "<tr><th>Import Directory:</th><td>" . $import_dir . "</td></tr>\n";
    218142print HTMLOUT "<tr><th>Processing Time:</th><td>" . &renderTime($total_duration) . "</td></tr>\n";
    219143print HTMLOUT "<tr><th>Processing Threads:</th><td>" . $number_of_workers . "</td></tr>\n";
     
    247171exit;
    248172
    249 # /**
    250 #  */
     173
     174## @function debugPrint()
     175#
     176sub debugPrint
     177{
     178  my $msg = shift(@_);
     179  if ($debug)
     180  {
     181    print STDERR '[DEBUG] ' . $msg . "\n";
     182  }
     183}
     184## debugPrint() ##
     185
     186
     187## @function filenameCat
     188#
     189sub filenameCat
     190{
     191  my $path = join('/', @_);
     192  $path =~ s/[\/\\]+/\//g;
     193  # protocols
     194  $path =~ s/^(HDFS|HDFSShell|HDThriftFS):\//$1:\/\//;
     195  return $path;
     196}
     197## filenameCat() ##
     198
     199## @function printUsage()
     200#
     201sub printUsage
     202{
     203  my $msg = shift(@_);
     204  if (defined $msg)
     205  {
     206    print 'Error! ' . $msg . "\n";
     207  }
     208  die("Usage: generate_gantt.pl <results dir> [<width in pixels>]\n\n");
     209}
     210## printUsage() ##
     211
     212
     213## @function longestCommonPath
     214#
     215sub longestCommonPath
     216{
     217  my ($path_new, $path_current) = @_;
     218  my $result = '';
     219  if (defined $path_current)
     220  {
     221    my @path_new_parts = split(/\//, $path_new);
     222    my @path_current_parts = split(/\//, $path_current);
     223    my @path_parts;
     224    for (my $i = 0; $i < scalar(@path_current_parts); $i++)
     225    {
     226      if ($path_current_parts[$i] eq $path_new_parts[$i])
     227      {
     228        push(@path_parts, $path_new_parts[$i]);
     229      }
     230      else
     231      {
     232        last;
     233      }
     234    }
     235    $result = &filenameCat(@path_parts);
     236  }
     237  else
     238  {
     239    $result = $path_new;
     240  }
     241  return $result;
     242}
     243## longestCommonPath() ##
     244
     245
     246## @function renderLine()
     247#
    251248sub renderLine
    252249{
    253250  my ($table_width, $start, $end, $class, $tname, $tstart, $tend, $jobs) = @_;
     251  &debugPrint("renderLine($table_width, $start, $end, $class, $tname, $tstart, $tend, <jobs>)");
    254252  # All timings need to be relative to 0 (relative start)
    255253  my $duration = $end - $start;
     
    311309  return $html;
    312310}
    313 # /** renderLine() **/
    314 
     311## renderLine() ##
     312
     313
     314## @function renderTime()
     315#
    315316sub renderTime
    316317{
Note: See TracChangeset for help on using the changeset viewer.