source: gs2-extensions/parallel-building/trunk/src/bin/script/dlreport.pl@ 28645

Last change on this file since 28645 was 28645, checked in by jmt12, 10 years ago

Script to generate a report on data locality from GreenstoneHadoop logs

  • Property svn:executable set to *
File size: 8.0 KB
Line 
1#!/usr/bin/perl
2
3use strict;
4use warnings;
5
6use List::Util qw(sum);
7use Sort::Key::Natural qw(natsort);
8
# Accumulated results keyed by replication value (first CSV column). Each
# entry holds a run count plus parallel arrays: epochs, average IO times,
# average processing times and data locality percentages, one element per
# test run.
my $data = {};
my $base_dir = '/research/jmt12/temp';
# Base name shared by the CSV file and the directory of per-epoch results;
# the first command line argument overrides the default.
my $filename = 'replication';
if (defined $ARGV[0])
{
  $filename = $ARGV[0];
}
my $path = $base_dir . '/' . $filename . '.csv';
if (!-f $path)
{
  die('File not found: ' . $path);
}

# Report the path actually being opened (not a hard-coded name) together
# with the system error, so failures with a custom filename are diagnosable.
open(my $fin, '<:utf8', $path)
  or die('Error! Failed to open file for reading: ' . $path . ' (' . $! . ')');
while (my $line = <$fin>)
{
  # Strip the line ending so the debug output is not double spaced
  chomp($line);
  print STDERR '[debug] line: ' . $line . "\n";
  # Only lines starting with three comma separated integers are test runs:
  # replication, test run number (not needed here) and epoch
  if ($line =~ /^(\d+),(\d+),(\d+)/)
  {
    my ($replication, $epoch) = ($1, $3);
    # '???' marks a value that was not found in the gantt chart
    my $avgtime = '???';
    my $avgiotime = '???';
    my $dl = '???';
    # Locate gantt chart
    my $gantt_path = $base_dir . '/' . $filename . '/' . $epoch . '/' . $epoch . '-gantt.html';
    print STDERR ' * Searching for: ' . $gantt_path . "\n";
    if (open(my $gin, '<:utf8', $gantt_path))
    {
      # A separate variable is used for chart lines so the CSV line being
      # processed is never overwritten by the inner read loop
      while (my $chart_line = <$gin>)
      {
        if ($chart_line =~ /<th>Average Processing Time:<\/th><td>([0-9hms]+)<\/td>/)
        {
          $avgtime = parseTime($1);
        }
        if ($chart_line =~ /<th>Average File IO Time:<\/th><td>([0-9hms]+)<\/td>/)
        {
          $avgiotime = parseTime($1);
        }
        if ($chart_line =~ /<th>Data Locality:<\/th><td>(\d+)%/)
        {
          $dl = $1;
        }
      }
      close($gin);
    }
    else
    {
      print STDERR 'Warning! Failed to find chart: ' . $gantt_path . "\n";
    }

    # Without a processing time the run cannot be reported at all
    if ($avgtime eq '???')
    {
      die("Failed to parse timing information from: " . $gantt_path);
    }
    # The other two values are not fatal when missing, but the placeholder
    # will be treated as zero by later arithmetic, so make that visible
    if ($avgiotime eq '???')
    {
      print STDERR 'Warning! No average file IO time found in: ' . $gantt_path . "\n";
    }
    if ($dl eq '???')
    {
      print STDERR 'Warning! No data locality found in: ' . $gantt_path . "\n";
    }

    # Store for averaging
    if (!defined $data->{$replication})
    {
      $data->{$replication} = {'count' => 0,
                               'epochs' => [],
                               'ios' => [],
                               'times' => [],
                               'dls' => []
                              };
    }
    my $runs = $data->{$replication};
    $runs->{'count'}++;
    push(@{$runs->{'epochs'}}, $epoch);
    push(@{$runs->{'ios'}}, $avgiotime);
    push(@{$runs->{'times'}}, $avgtime);
    push(@{$runs->{'dls'}}, $dl);
  }
}
close($fin);
92
# Derive summary statistics for every replication value. For each measured
# series the mean, median and sample standard deviation are stored back into
# the replication's hash under '<prefix>_mean', '<prefix>_median' and
# '<prefix>_stddev', along with bounds two standard deviations either side
# of the mean ('<prefix>_lbound' / '<prefix>_ubound').
foreach my $replication (natsort keys %{$data})
{
  my $stats = $data->{$replication};
  # Statistic key prefix paired with the name of the array holding its values
  my @series = (['pt', 'times'], ['io', 'ios'], ['dl', 'dls']);
  foreach my $pair (@series)
  {
    my ($prefix, $values_name) = @{$pair};
    my $values = $stats->{$values_name};
    my $mean = &calculateMean($values);
    my $median = &calculateMedian($values);
    my $stddev = &calculateStandardDeviation($values, $mean);
    my $lbound = $mean - 2 * $stddev;
    my $ubound = $mean + 2 * $stddev;
    # Special cases for percentages, which can't be less than 0 nor greater than 100
    if ($prefix eq 'dl')
    {
      $lbound = 0 if ($lbound < 0);
      $ubound = 100 if ($ubound > 100);
    }
    $stats->{$prefix . '_mean'} = $mean;
    $stats->{$prefix . '_median'} = $median;
    $stats->{$prefix . '_stddev'} = $stddev;
    $stats->{$prefix . '_lbound'} = $lbound;
    $stats->{$prefix . '_ubound'} = $ubound;
  }
}
120
# Write the report to STDOUT as one HTML page: the document head and styles,
# a table of every individual test run, then a table of the statistics
# calculated for each replication value.
print '<html>
<head>
<style>
table {
 border:1px solid black;
 border-collapse:collapse;
 margin-left:auto;
 margin-right:auto;
 width:80%;
}
td {
 border:1px solid black;
 padding:2px;
 text-align:right;
}
th {
 border:1px solid black;
 background-color:#C7C7C7;
}
</style>
</head>
<body>';

print '<h1>Data Locality Report</h1>';

print '<ul><li><a href="#raw">Raw Data</a></li><li><a href="#averaged">Averaged</a></li></ul>';

print '<h2><a name="raw"></a>Raw Data</h2>
<table>
 <tr>
 <th rowspan="2">Replication</th>
 <th rowspan="2">Epoch</th>
 <th colspan="3">Avg Per File</th>
 <th rowspan="2">DataLocality</th>
 </tr>
 <tr>
 <th>IO</th><th>CPU</th><th>Total</th>
 </tr>
';
# One row per test run. The row anchor (resultR.N) is the target of the link
# in the averaged table, and the replication cell links back to that
# replication's averaged row (avgR).
foreach my $replication (natsort keys %{$data})
{
  my $runs = $data->{$replication};
  foreach my $test_run (0 .. $runs->{'count'} - 1)
  {
    my $epoch = $runs->{'epochs'}->[$test_run];
    my $avgiotime = $runs->{'ios'}->[$test_run];
    my $avgtime = $runs->{'times'}->[$test_run];
    my $dl = $runs->{'dls'}->[$test_run];
    # The CPU column is the processing time left once IO time is removed
    printf('<tr><th><a name="result%d.%d" href="#avg%d">%2d</a></th><td><a href="%s/%d/%d-gantt.html">%d</a></td><td>%4d</td><td>%4d</td><td>%4d</td><td>%3d%%</td></tr>' . "\n", $replication, $test_run, $replication, $replication, $filename, $epoch, $epoch, $epoch, $avgiotime, ($avgtime - $avgiotime), $avgtime, $dl);
  }
}
print '</table>';
print '<a href="#">back to top</a><br />';


print '<h2><a name="averaged"></a>Averaged</h2>';
print '<table><tr><th rowspan="2">Replication</th><th rowspan="2">Count</th><th colspan="5">Processing Time (s)</th><th colspan="5">IO Time (s)</th><th colspan="5">Data Locality (%)</th></tr>
<tr>';
# The same five sub-headings repeat under each of the three column groups
for (1 .. 3)
{
  print '<th>Median</th><th>Mean</th><th>StdDev</th><th>LBound</th><th>UBound</th>';
}
print '</tr>';
foreach my $replication (natsort keys %{$data})
{
  my $count = $data->{$replication}->{'count'};
  print '<tr><th><a name="avg' . $replication . '" href="#result' . $replication . '.0">' . $replication . '</a></th><td>' . $count . '</td>';
  # Processing Time (pt)
  print renderStatisticsAsHTML($data->{$replication}, 'pt');
  # IO Time (io)
  print renderStatisticsAsHTML($data->{$replication}, 'io');
  # Data Locality Percentages (dl)
  print renderStatisticsAsHTML($data->{$replication}, 'dl', '%');
  print "</tr>\n";
}
print '</table>';
print '<a href="#">back to top</a>';
# Close the body element opened at the top of the page before ending the document
print '</body></html>';

# Stop here so execution never runs on into the subroutine definitions below
exit;
202
## @function calculateMean()
# Return the arithmetic mean of the numbers in an array.
# @param $data reference to an array of numeric values
# @return the sum of the values divided by how many there are
# Dies with "Empty array\n" when the array has no elements.
sub calculateMean
{
  my ($data) = @_;
  my @samples = @{$data};
  unless (@samples)
  {
    die("Empty array\n");
  }
  my $running_total = 0;
  $running_total += $_ for @samples;
  return $running_total / scalar(@samples);
}
## calculateMean() ##
221
## @function calculateMedian()
# Return the median of the numbers in an array.
# @param $data reference to an array of numeric values (left unmodified)
# @return the middle value of the numerically sorted values when there is an
#         odd number of them, otherwise the mean of the two middle values
# Dies with "Empty array\n" when the array has no elements, matching
# calculateMean(), rather than returning a meaningless zero.
sub calculateMedian
{
  my ($data) = @_;
  # Sort a copy numerically; the default sort would compare as strings
  my @vals = sort {$a <=> $b} @{$data};
  my $len = scalar(@vals);
  if ($len == 0)
  {
    die("Empty array\n");
  }
  my $middle = int($len / 2);
  if ($len % 2) #odd?
  {
    return $vals[$middle];
  }
  #even
  return ($vals[$middle - 1] + $vals[$middle]) / 2;
}
## calculateMedian() ##
238
## @function calculateStandardDeviation()
# Return the sample standard deviation of the numbers in an array, i.e. the
# square root of the summed squared differences from the mean divided by
# (count - 1).
# @param $data reference to an array of numeric values
# @param $average [optional] the mean of the values; computed with
#        calculateMean() when not supplied
# @return the standard deviation, or 0 when there is exactly one value
sub calculateStandardDeviation
{
  my ($data, $average) = @_;
  my $sample_count = scalar(@{$data});
  # A single sample has no spread (and would otherwise divide by zero)
  return 0 if ($sample_count == 1);
  $average = &calculateMean($data) unless (defined $average);
  my $squared_differences = 0;
  foreach my $sample (@{$data})
  {
    my $difference = $average - $sample;
    $squared_differences += $difference ** 2;
  }
  return ($squared_differences / ($sample_count - 1)) ** 0.5;
}
## calculateStandardDeviation() ##
261
## @function parseTime()
# Convert a duration string such as '1h2m3s' into a number of seconds.
# Each of the hours, minutes and seconds parts is optional; a part is a run
# of digits immediately followed by 'h', 'm' or 's' respectively.
# @param $raw_time_str the duration string to convert
# @return the total in seconds, 0 when no part is recognised
sub parseTime
{
  my ($raw_time_str) = @_;
  # Unit letter paired with the number of seconds one of that unit is worth
  my @unit_weights = (['h', 60 * 60], ['m', 60], ['s', 1]);
  my $seconds = 0;
  foreach my $unit_weight (@unit_weights)
  {
    my ($letter, $weight) = @{$unit_weight};
    if ($raw_time_str =~ /(\d+)$letter/)
    {
      $seconds += $1 * $weight;
    }
  }
  return $seconds;
}
## parseTime() ##
280
## @function renderStatisticsAsHTML()
# Build the five table cells (median, mean, standard deviation, lower bound
# and upper bound, in that order) for one group of statistics.
# @param $data reference to a hash containing '<prefix>_median',
#        '<prefix>_mean', '<prefix>_stddev', '<prefix>_lbound' and
#        '<prefix>_ubound'
# @param $prefix the statistic key prefix to render
# @param $suffix [optional] text appended to every value; defaults to ''
# @return a string of five <td> elements
sub renderStatisticsAsHTML
{
  my ($data, $prefix, $suffix) = @_;
  $suffix = '' unless (defined $suffix);
  # The median is shown exactly as stored; the rest are fixed to two decimals
  my @cells = ($data->{$prefix . '_median'});
  foreach my $statistic ('mean', 'stddev', 'lbound', 'ubound')
  {
    push(@cells, sprintf('%0.2f', $data->{$prefix . '_' . $statistic}));
  }
  return join('', map { '<td>' . $_ . $suffix . '</td>' } @cells);
}
## renderStatisticsAsHTML() ##
Note: See TracBrowser for help on using the repository browser.