Changeset 27551

Show
Ignore:
Timestamp:
05.06.2013 13:07:43 (6 years ago)
Author:
jmt12
Message:

Altered so that it expects to be given a CSV containing parallel processing information (which can come from OpenMPI or Hadoop Greenstone imports). It also now determines the import directory by finding the longest common directory path among the files processed.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/generate_gantt.pl

    r27543 r27551  
    99print "\n===== Generate Timing (GANTT) =====\n"; 
    1010 
     11# 0. Configuration 
     12my $debug = 0; 
     13my $import_dir; 
     14 
    1115# 1. Initialization 
    12 #my $chart_width = 1024; 
    13 my $chart_width = 1536; 
    14 #my $chart_width = 2048; 
     16if (!defined $ARGV[0] || !-d $ARGV[0]) 
     17{ 
     18  &printUsage('Directory not provided or doesn\'t exist'); 
     19} 
     20my $dir = $ARGV[0]; 
     21my $timing_csv_path = &filenameCat($dir, 'timing.csv'); 
     22if (!-e $timing_csv_path) 
     23{ 
     24  &printUsage('Directory doesn\'t contain timing.csv: ' . $dir); 
     25} 
     26print 'Timing File: ' . $timing_csv_path . "\n"; 
     27my $chart_width = 1024; 
     28if (defined $ARGV[1]) 
     29{ 
     30  if ($ARGV[1] !~ /^\d+$/) 
     31  { 
     32    &printUsage('Chart width not a number'); 
     33  } 
     34  $chart_width = $ARGV[1]; 
     35} 
    1536print "Chart Width: " . $chart_width . "px\n"; 
    16 # 1.1 Store all information extracted in a cool data structure 
    17 # - N = hostname, S = thread start, E = thread end 
    18 my $timing_data = {'M' => {'N'=>'', 'S'=>0, 'E'=>0}}; 
    19 # 1.2 Check the file exists 
    20 if (!-f $ARGV[0]) 
    21 { 
    22   die("Error! File can't be read: " . $ARGV[0]); 
    23 } 
    24 my $main_log_filename = $ARGV[0]; 
    25 # 1.2 From the filename we can parse in some information like the number of worker threads 
    26 my $number_of_workers = 0; 
    27 if ($main_log_filename =~ /\-W(\d+)E/) 
    28 { 
    29   $number_of_workers = $1; 
     37print "===================================\n\n"; 
     38 
     39# Read in timing.csv and parse information into data structure 
     40my $timing_data = {}; 
     41my $id_2_worker_id = {}; 
     42if (open(TIN, '<:utf8', $timing_csv_path)) 
     43{ 
     44  my $line; 
     45  while ($line = <TIN>) 
     46  { 
     47    my @parts = split(/,/, $line); 
     48    if ($parts[1] eq 'M0') 
     49    { 
     50      $timing_data->{'M'} = {'N'=>$parts[2], 'S'=>$parts[3], 'E'=>$parts[4]}; 
     51    } 
     52    elsif ($parts[1] =~ /W\d+/) 
     53    { 
     54      $timing_data->{$parts[1]} = {'N'=>$parts[2], 'S'=>$parts[3], 'E'=>$parts[4], 'F'=>{}}; 
     55      $id_2_worker_id->{$parts[0]} = $parts[1]; 
     56    } 
     57    elsif ($parts[1] =~ /T\d+/) 
     58    { 
     59      my $worker_id = $id_2_worker_id->{$parts[7]}; 
     60      my $stop = $parts[4]; 
     61      my $filepath = $parts[8]; 
     62      $import_dir = &longestCommonPath($filepath, $import_dir); 
     63      $timing_data->{$worker_id}->{'F'}->{$parts[3]} = {'FN'=>$filepath, 'PS'=>($stop - $parts[5]), 'PE'=>$stop, 'E'=>$stop}; 
     64    } 
     65  } 
     66  close(TIN); 
    3067} 
    3168else 
    3269{ 
    33   die("Error! Malformed filename (expecting number of workers): " . $main_log_filename); 
    34 } 
    35 # 1.3 Initialize the data structure with the number of workers too (we don't 
    36 #     know the number of files yet, so they'll have to be adding on the fly) 
    37 for (my $i = 1; $i <= $number_of_workers; $i++) 
    38 { 
    39   $timing_data->{'W' . $i} = {'N'=>'', 'S'=>0, 'E'=>0, 'F'=>{}}; 
    40 } 
    41  
    42 # 2. Read in main log file 
    43 print " * Reading main log: " . $main_log_filename . "\n"; 
    44 open(LOGIN, '<:utf8', $main_log_filename) or die("Error! Failed to open file for reading: " . $main_log_filename); 
    45 my $line = ''; 
    46 my $currently_processing = {}; 
    47 while ($line = <LOGIN>) 
    48 { 
    49   # 2.1 Parse in the Master thread start time 
    50   if ($line =~ /\[M\d?:(\d+)\] Starting on (.+)/) 
    51   { 
    52     $timing_data->{'M'}->{'S'} = $1; 
    53     $timing_data->{'M'}->{'N'} = $2; 
    54   } 
    55   elsif ($line =~ /\[(W\d+):(\d+)\] Starting on (.+)/) 
    56   { 
    57     my $worker_id = $1; 
    58     $timing_data->{$worker_id}->{'S'} = $2; 
    59     $timing_data->{$worker_id}->{'N'} = $3; 
    60   } 
    61   elsif ($line =~ /\[(W\d+):(\d+)\] Processing/) 
    62   { 
    63     my $worker_id = $1; 
    64     my $job_start_time = $2; 
    65     $timing_data->{$worker_id}->{'F'}->{$job_start_time} = {'FN'=>'', 'PS'=>0, 'PE'=>0, 'E'=>0};   
    66     $currently_processing->{$worker_id} = $job_start_time; 
    67   } 
    68   # 2.3 Or we may parse in the starting times for each working thread 
    69   # 2.4 Or we may also parse (the last encountered) completion time for each 
    70   #     working thread 
    71   elsif ($line =~ /\[(W\d+):(\d+)\] Process complete/) 
    72   { 
    73     my $worker_id = $1; 
    74     my $job_end_time = $2; 
    75     $timing_data->{$worker_id}->{'E'} = $job_end_time; 
    76     my $job_start_time = $currently_processing->{$worker_id}; 
    77     $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'E'} = $job_end_time; 
    78     delete($currently_processing->{$worker_id}); 
    79   } 
    80   # 2.5 Finally, we may parse in the Master thread end time 
    81   elsif ($line =~ /\[M\d?:(\d+)\] Master will exit when workers complete/) 
    82   { 
    83     $timing_data->{'M'}->{'E'} = $1; 
    84   } 
    85 } 
    86 close(LOGIN); 
    87  
    88 # 3. Read each of worker logs parsing in information about the files processed 
    89 #    - each will be stored (in an associative array) against its start time 
    90 print " * Reading worker logs"; 
    91 foreach my $worker_id (nsort keys %{$timing_data}) 
    92 { 
    93   my $jobs = $timing_data->{$worker_id}->{'F'}; 
    94   my $counter = 1; 
    95   foreach my $job_start_time (sort keys %{$jobs}) 
    96   { 
    97     my $log_filename = 'gsimport-' . $worker_id . '-' . $counter . '.log'; 
    98     print "."; 
    99     open(WLOGIN, '<:utf8', $log_filename) or die("Error! Failed to open for reading: " . $log_filename); 
    100     my $wline = ''; 
    101     while ($wline = <WLOGIN>) 
    102     { 
    103       if ($wline =~ /\[A:\d+\] SimpleVideoPlugin processing: (.+)/) 
    104       { 
    105         $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'FN'} = $1; 
    106       } 
    107       # Start of video processing (excluding as much IO as possible)  
    108       elsif ($wline =~ /\[C1:(\d+)\]/) 
    109       { 
    110         $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'PS'} = $1; 
    111       } 
    112       # Immediately after processing video 
    113       elsif ($wline =~ /\[E2:(\d+)\]/) 
    114       { 
    115         $timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'PE'} = $1; 
    116       } 
    117     } 
    118  
    119     if ($timing_data->{$worker_id}->{'F'}->{$job_start_time}->{'PE'} <= 0) 
    120     { 
    121       print "\n[Warning - bogus log: $log_filename]"; 
    122     } 
    123  
    124     close(WLOGIN); 
    125     $counter++; 
    126   } 
    127 } 
    128 print " Done!\n"; 
    129  
    130 # 4. Produce CSV of information 
    131 print " * Generating timing information as CSV... "; 
    132 open(CSVOUT, '>:utf8', 'timing.csv') or die('Error! Failed to open file for writing: timing.csv'); 
    133 print CSVOUT "number,id,hostname,start,end,hierarchy\n"; 
    134 my $thread_counter = 1; 
    135 foreach my $thread (nsort keys %{$timing_data}) 
    136 { 
    137   my $data = $timing_data->{$thread}; 
    138   print CSVOUT $thread_counter . ',' . $thread . ',' . $data->{'N'} . ',' . strftime("%H:%M:%S", localtime($data->{'S'})) . ',' . strftime("%H:%M:%S", localtime($data->{'E'})) . ','; 
    139   if ($thread eq 'M') 
    140   { 
    141     print CSVOUT '0'; 
    142   } 
    143   else 
    144   { 
    145     print CSVOUT '1'; 
    146   } 
    147   print CSVOUT "\n"; 
    148   $thread_counter++; 
    149 } 
    150 close(CSVOUT); 
    151 print "Done!\n"; 
    152  
    153 # 5. Produce pretty HTML chart of timing information including jobs 
     70  die('Error! Failed to open file for reading: ' . $timing_csv_path); 
     71} 
     72my $number_of_workers = scalar(keys(%{$id_2_worker_id}));; 
     73 
     74# 3. Produce pretty HTML chart of timing information including jobs 
    15475print " * Generating timing information as HTML... "; 
    155 open(HTMLOUT, '>:utf8', 'gantt.html') or die('Error! Failed to open file for writing: gantt.html'); 
     76open(HTMLOUT, '>:utf8', $dir . '/gantt.html') or die('Error! Failed to open file for writing: gantt.html'); 
    15677print HTMLOUT "<html>\n"; 
    15778print HTMLOUT '<head>' . "\n"; 
     
    15980print HTMLOUT 'div.thread {position:relative}' . "\n"; 
    16081print HTMLOUT 'div.master {border:1px solid gray;color:white;font-weight:bold}' . "\n"; 
    161 print HTMLOUT 'div.worker {background-color:green;color:white;font-weight:bold}' . "\n"; 
     82print HTMLOUT 'div.worker {border:1px solid black;background-color:green;color:white;font-weight:bold}' . "\n"; 
    16283print HTMLOUT 'div.time {font-size:smaller;font-weight:normal}' . "\n"; 
    16384print HTMLOUT 'div.job {background-color:transparent;color:black;border:1px solid black;display:block;font-size:smaller;position:relative;text-align:center}' . "\n"; 
     
    195116        my $process_duration = $process_end - $process_start; 
    196117        my $total_duration = $io_duration + $process_duration; 
    197         ###rint "[DEBUG] filename: " . $timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'} . "\n"; 
    198         ###rint "[DEBUG] start: $job_start ps: $process_start pe: $process_end end: $job_end\n"; 
    199         ###rint "[DEBUG] io: $io_duration process: $process_duration duration: $total_duration\n"; 
     118        &debugPrint("filename: " . $timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'}); 
     119        &debugPrint("start: $job_start ps: $process_start pe: $process_end end: $job_end"); 
     120        &debugPrint("io: $io_duration process: $process_duration duration: $total_duration"); 
    200121        # Running stats 
    201122        $total_io_time += $io_duration; 
     
    210131        } 
    211132      } 
     133      # Shorten filename 
     134      $timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'} = substr($timing_data->{$worker_id}->{'F'}->{$job_start}->{'FN'}, length($import_dir) + 1); 
    212135      $file_count++; 
    213136    } 
     
    216139my $avg_processing_time = floor(($total_io_time + $total_process_time) / $file_count); 
    217140 
     141print HTMLOUT "<tr><th>Import Directory:</th><td>" . $import_dir . "</td></tr>\n"; 
    218142print HTMLOUT "<tr><th>Processing Time:</th><td>" . &renderTime($total_duration) . "</td></tr>\n"; 
    219143print HTMLOUT "<tr><th>Processing Threads:</th><td>" . $number_of_workers . "</td></tr>\n"; 
     
    247171exit; 
    248172 
    249 # /** 
    250 #  */ 
     173 
     174## @function debugPrint() 
     175# Prints the given message to STDERR, prefixed with '[DEBUG] ' and followed by a newline, but only when the file-level $debug flag is true; otherwise does nothing.
     176sub debugPrint 
     177{ 
     178  my $msg = shift(@_); 
     179  if ($debug) 
     180  { 
     181    print STDERR '[DEBUG] ' . $msg . "\n"; 
     182  } 
     183} 
     184## debugPrint() ## 
     185 
     186 
     187## @function filenameCat 
     188# Joins all arguments with '/', collapses every run of forward slashes and backslashes into a single '/', and returns the resulting path string.
     189sub filenameCat 
     190{ 
     191  my $path = join('/', @_); 
     192  $path =~ s/[\/\\]+/\//g; 
     193  # protocols: restore the '//' after an HDFS, HDFSShell or HDThriftFS prefix, which the collapse above reduced to a single '/' 
     194  $path =~ s/^(HDFS|HDFSShell|HDThriftFS):\//$1:\/\//; 
     195  return $path; 
     196} 
     197## filenameCat() ## 
     198 
     199## @function printUsage() 
     200# Prints 'Error! ' followed by the optional message (when one is supplied), then dies with the usage string; this function never returns. The usage text ends in a newline, so Perl does not append the script name and line number.
     201sub printUsage 
     202{ 
     203  my $msg = shift(@_); 
     204  if (defined $msg) 
     205  { 
     206    print 'Error! ' . $msg . "\n"; 
     207  } 
     208  die("Usage: generate_gantt.pl <results dir> [<width in pixels>]\n\n"); 
     209} 
     210## printUsage() ## 
     211 
     212 
     213## @function longestCommonPath 
     214# Returns the leading '/'-separated components shared by $path_new and $path_current, rejoined with filenameCat(); when $path_current is undefined, returns $path_new unchanged.
     215sub longestCommonPath 
     216{ 
     217  my ($path_new, $path_current) = @_; 
     218  my $result = ''; 
     219  if (defined $path_current) 
     220  { 
     221    my @path_new_parts = split(/\//, $path_new); 
     222    my @path_current_parts = split(/\//, $path_current); 
     223    my @path_parts; 
     224    for (my $i = 0; $i < scalar(@path_current_parts); $i++) # bounded by the current path's component count only
     225    { 
     226      if ($path_current_parts[$i] eq $path_new_parts[$i]) # NOTE(review): $path_new_parts[$i] is undef once $i passes the end of the new path; undef compares equal to an empty component - confirm this is acceptable
     227      { 
     228        push(@path_parts, $path_new_parts[$i]); 
     229      } 
     230      else 
     231      { 
     232        last; # stop at the first component that differs
     233      } 
     234    } 
     235    $result = &filenameCat(@path_parts); 
     236  } 
     237  else 
     238  { 
     239    $result = $path_new; # nothing to compare against yet
     240  } 
     241  return $result; 
     242} 
     243## longestCommonPath() ## 
     244 
     245 
     246## @function renderLine() 
     247# 
    251248sub renderLine 
    252249{ 
    253250  my ($table_width, $start, $end, $class, $tname, $tstart, $tend, $jobs) = @_; 
     251  &debugPrint("renderLine($table_width, $start, $end, $class, $tname, $tstart, $tend, <jobs>)"); 
    254252  # All timings need to be relative to 0 (relative start) 
    255253  my $duration = $end - $start; 
     
    311309  return $html; 
    312310} 
    313 # /** renderLine() **/ 
    314  
     311## renderLine() ## 
     312 
     313 
     314## @function renderTime() 
     315# 
    315316sub renderTime 
    316317{