source: other-projects/maori-lang-detection/journal-paper/CommonCrawl_flow.svg

Last change on this file was 33856, checked in by ak19, 4 years ago

Forgot to commit. Last week, Dr Bainbridge had properly cropped the SVG flow chart of the common crawl to mongodb process in Inkscape. He'd also exported a PDF of it cropped to the bounds, which I'm also adding to SVN now.

File size: 51.9 KB
RevLine 
[33840]1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3
4<svg
5 xmlns:dc="http://purl.org/dc/elements/1.1/"
6 xmlns:cc="http://creativecommons.org/ns#"
7 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
8 xmlns:svg="http://www.w3.org/2000/svg"
9 xmlns="http://www.w3.org/2000/svg"
10 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
11 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
12 width="210mm"
13 height="297mm"
14 viewBox="0 0 210 297"
15 version="1.1"
[33856]16 id="svg3410"
[33840]17 inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
[33841]18 sodipodi:docname="CommonCrawl_flow2.svg">
[33840]19 <defs
[33856]20 id="defs3404">
[33840]21 <marker
22 inkscape:stockid="Arrow2Lstart"
23 orient="auto"
[33856]24 refY="0"
25 refX="0"
26 id="Arrow2Lstart-2-8-8-6"
[33840]27 style="overflow:visible"
28 inkscape:isstock="true">
29 <path
[33856]30 inkscape:connector-curvature="0"
31 id="path6664-1-8-8-0"
32 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
33 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
34 transform="matrix(1.1,0,0,1.1,1.1,0)" />
[33840]35 </marker>
36 <marker
37 inkscape:stockid="Arrow2Lstart"
38 orient="auto"
39 refY="0"
40 refX="0"
[33856]41 id="Arrow2Lstart-2-9-2-8"
[33840]42 style="overflow:visible"
43 inkscape:isstock="true">
44 <path
45 inkscape:connector-curvature="0"
[33856]46 id="path6664-1-1-3-1"
[33840]47 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
48 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
49 transform="matrix(1.1,0,0,1.1,1.1,0)" />
50 </marker>
51 <marker
52 inkscape:stockid="Arrow2Lstart"
53 orient="auto"
54 refY="0"
55 refX="0"
[33856]56 id="Arrow2Lstart-2-8-8"
[33840]57 style="overflow:visible"
58 inkscape:isstock="true">
59 <path
60 inkscape:connector-curvature="0"
[33856]61 id="path6664-1-8-8"
[33840]62 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
63 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
64 transform="matrix(1.1,0,0,1.1,1.1,0)" />
65 </marker>
66 <marker
67 inkscape:stockid="Arrow2Lstart"
68 orient="auto"
69 refY="0"
70 refX="0"
[33856]71 id="Arrow2Lstart-2-9-2"
[33840]72 style="overflow:visible"
73 inkscape:isstock="true">
74 <path
75 inkscape:connector-curvature="0"
[33856]76 id="path6664-1-1-3"
[33840]77 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
78 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
79 transform="matrix(1.1,0,0,1.1,1.1,0)" />
80 </marker>
81 <marker
82 inkscape:stockid="Arrow2Lstart"
83 orient="auto"
84 refY="0"
85 refX="0"
[33856]86 id="Arrow2Lstart-2-9"
[33840]87 style="overflow:visible"
88 inkscape:isstock="true">
89 <path
90 inkscape:connector-curvature="0"
[33856]91 id="path6664-1-1"
[33840]92 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
93 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
94 transform="matrix(1.1,0,0,1.1,1.1,0)" />
95 </marker>
96 <marker
97 inkscape:stockid="Arrow2Lstart"
98 orient="auto"
99 refY="0"
100 refX="0"
[33856]101 id="Arrow2Lstart-2-8"
[33840]102 style="overflow:visible"
103 inkscape:isstock="true">
104 <path
105 inkscape:connector-curvature="0"
[33856]106 id="path6664-1-8"
[33840]107 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
108 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
109 transform="matrix(1.1,0,0,1.1,1.1,0)" />
110 </marker>
[33841]111 <marker
112 inkscape:stockid="Arrow2Lstart"
113 orient="auto"
114 refY="0"
115 refX="0"
[33856]116 id="Arrow2Lstart-2"
[33841]117 style="overflow:visible"
118 inkscape:isstock="true">
119 <path
120 inkscape:connector-curvature="0"
[33856]121 id="path6664-1"
[33841]122 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
123 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
124 transform="matrix(1.1,0,0,1.1,1.1,0)" />
125 </marker>
126 <marker
127 inkscape:stockid="Arrow2Lstart"
128 orient="auto"
129 refY="0"
130 refX="0"
[33856]131 id="Arrow2Lstart"
[33841]132 style="overflow:visible"
133 inkscape:isstock="true">
134 <path
135 inkscape:connector-curvature="0"
[33856]136 id="path6664"
[33841]137 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
138 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
139 transform="matrix(1.1,0,0,1.1,1.1,0)" />
140 </marker>
[33840]141 </defs>
142 <sodipodi:namedview
143 id="base"
144 pagecolor="#ffffff"
145 bordercolor="#666666"
146 borderopacity="1.0"
147 inkscape:pageopacity="0.0"
148 inkscape:pageshadow="2"
149 inkscape:zoom="0.7"
[33856]150 inkscape:cx="229.97759"
151 inkscape:cy="567.58817"
[33840]152 inkscape:document-units="mm"
[33856]153 inkscape:current-layer="layer1"
[33840]154 showgrid="false"
155 inkscape:window-width="1680"
156 inkscape:window-height="988"
157 inkscape:window-x="-8"
158 inkscape:window-y="-8"
[33856]159 inkscape:window-maximized="1" />
[33840]160 <metadata
[33856]161 id="metadata3407">
[33840]162 <rdf:RDF>
163 <cc:Work
164 rdf:about="">
165 <dc:format>image/svg+xml</dc:format>
166 <dc:type
167 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
[33856]168 <dc:title></dc:title>
[33840]169 </cc:Work>
170 </rdf:RDF>
171 </metadata>
172 <g
173 inkscape:label="Layer 1"
174 inkscape:groupmode="layer"
175 id="layer1">
176 <g
[33856]177 id="g13905"
178 transform="translate(-2.6849314,-0.31280524)">
179 <rect
180 y="8.9821434"
181 x="36.174416"
182 height="32.630665"
183 width="26.947613"
184 id="rect5954"
185 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
186 <flowRoot
[33840]187 xml:space="preserve"
[33856]188 id="flowRoot6219"
189 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
190 transform="matrix(0.26458333,0,0,0.26458333,20.323189,-24.166702)"><flowRegion
191 id="flowRegion6221"><rect
192 id="rect6223"
193 width="54.285713"
194 height="37.142857"
195 x="310"
196 y="185.37683" /></flowRegion><flowPara
197 id="flowPara6225">...</flowPara></flowRoot> <flowRoot
198 transform="matrix(0.26458333,0,0,0.26458333,1.8898848,4.5357143)"
199 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
200 id="flowRoot13798"
201 xml:space="preserve"><flowRegion
202 id="flowRegion13800"><rect
203 y="33.948257"
204 x="136.72218"
205 height="106.18565"
206 width="94.70639"
207 id="rect13802" /></flowRegion><flowPara
208 id="flowPara13804">CC Sep 2018</flowPara><flowPara
209 id="flowPara13806">Columnar Index</flowPara><flowPara
210 id="flowPara13808" /></flowRoot> <rect
211 y="8.8441896"
212 x="69.302986"
213 height="32.630665"
214 width="26.947613"
215 id="rect5954-4"
216 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
[33840]217 <flowRoot
[33856]218 transform="matrix(0.26458333,0,0,0.26458333,35.018452,4.3977563)"
219 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
220 id="flowRoot13798-0"
221 xml:space="preserve"><flowRegion
222 id="flowRegion13800-2"><rect
223 y="33.948257"
224 x="136.72218"
225 height="106.18565"
226 width="94.70639"
227 id="rect13802-0" /></flowRegion><flowPara
228 id="flowPara13804-8">CC Oct 2018</flowPara><flowPara
229 id="flowPara13806-1">Columnar Index</flowPara><flowPara
230 id="flowPara13808-6" /></flowRoot> <rect
231 y="8.9197874"
232 x="113.52618"
233 height="32.630665"
234 width="26.947613"
235 id="rect5954-1"
236 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
237 <flowRoot
238 transform="matrix(0.26458333,0,0,0.26458333,79.241667,4.4733578)"
239 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
240 id="flowRoot13798-09"
241 xml:space="preserve"><flowRegion
242 id="flowRegion13800-0"><rect
243 y="33.948257"
244 x="136.72218"
245 height="106.18565"
246 width="94.70639"
247 id="rect13802-07" /></flowRegion><flowPara
248 id="flowPara13876">CC Aug 2019</flowPara><flowPara
249 id="flowPara13806-6">Columnar Index</flowPara><flowPara
250 id="flowPara13808-1" /></flowRoot> </g>
251 <g
252 transform="translate(-2.6849314,-0.31280524)"
[33840]253 id="g8700">
254 <flowRoot
255 transform="matrix(0.26458333,0,0,0.26458333,2.6458333,3.4017858)"
256 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
257 id="flowRoot6392"
258 xml:space="preserve"><flowRegion
259 id="flowRegion6394"><rect
260 y="308.23401"
261 x="78.571426"
262 height="91.428543"
263 width="192.85715"
264 id="rect6396" /></flowRegion><flowPara
[33841]265 id="flowPara6404" /></flowRoot> <g
266 id="g14156"
267 transform="translate(0,-14.81667)">
[33840]268 <rect
[33841]269 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
270 id="rect6388"
[33840]271 width="51.026787"
272 height="24.190477"
[33841]273 x="20.788691"
274 y="81.553574" />
[33840]275 <rect
[33841]276 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
277 id="rect6390"
278 width="15.497024"
279 height="5.2916689"
280 x="20.788691"
281 y="76.261909" />
282 <g
283 id="g6579"
284 transform="translate(-32.883929,-20.197169)">
285 <rect
286 y="103.47619"
287 x="99.974701"
288 height="24.190477"
289 width="51.026787"
290 id="rect6388-2"
291 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
292 <rect
293 y="98.184525"
294 x="99.974701"
295 height="5.2916694"
296 width="15.497025"
297 id="rect6390-7"
298 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
299 </g>
300 <g
301 id="g6630"
302 transform="translate(61.241527,-46.849824)">
303 <rect
304 y="131.19792"
305 x="51.460232"
306 height="24.190477"
307 width="51.026787"
308 id="rect6388-8-4"
309 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" />
310 <rect
311 y="125.90625"
312 x="51.460232"
313 height="5.2916694"
314 width="15.497025"
315 id="rect6390-9-4"
316 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" />
317 </g>
318 <g
319 id="g6591"
320 transform="translate(21.43299,-58.018398)">
321 <rect
322 y="143.1637"
323 x="111.31399"
324 height="24.190477"
325 width="51.026787"
326 id="rect6388-8"
327 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
328 <rect
329 y="137.87202"
330 x="111.31399"
331 height="5.2916694"
332 width="15.497025"
333 id="rect6390-9"
334 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
335 </g>
[33840]336 <flowRoot
[33841]337 transform="matrix(0.26458333,0,0,0.26458333,4.9136906,4.8380953)"
338 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
339 id="flowRoot14053"
340 xml:space="preserve"><flowRegion
341 id="flowRegion14055"><rect
342 y="308.23401"
[33840]343 x="78.571426"
[33841]344 height="52.235638"
345 width="142.14285"
346 id="rect14057" /></flowRegion><flowPara
347 id="flowPara14059">*.warc.wet files</flowPara><flowPara
348 id="flowPara14061">CC Sep 2018</flowPara></flowRoot> <flowRoot
349 transform="matrix(0.26458333,0,0,0.26458333,50.006029,7.3307912)"
350 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
351 id="flowRoot14053-6"
352 xml:space="preserve"><flowRegion
353 id="flowRegion14055-5"><rect
354 y="308.23401"
355 x="78.571426"
356 height="52.235638"
357 width="142.14285"
358 id="rect14057-1" /></flowRegion><flowPara
359 id="flowPara14059-3">*.warc.wet files</flowPara><flowPara
360 id="flowPara14061-0">CC Oct 2018</flowPara></flowRoot> <flowRoot
361 transform="matrix(0.26458333,0,0,0.26458333,114.78021,8.3683492)"
362 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
363 id="flowRoot14053-6-6"
364 xml:space="preserve"><flowRegion
365 id="flowRegion14055-5-5"><rect
366 y="308.23401"
367 x="78.571426"
368 height="52.235638"
369 width="142.14285"
370 id="rect14057-1-8" /></flowRegion><flowPara
371 id="flowPara14059-3-3">*.warc.wet files</flowPara><flowPara
372 id="flowPara14061-0-1">CC Aug 2019</flowPara></flowRoot> </g>
[33840]373 </g>
374 <path
375 style="fill:none;stroke:#000000;stroke-width:0.64353597px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart)"
[33856]376 d="M 88.878446,63.220066 C 88.346192,43.177342 90.224644,43.177342 90.224644,43.177342"
[33840]377 id="path6644"
378 inkscape:connector-curvature="0" />
379 <text
380 xml:space="preserve"
381 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
[33856]382 x="123.35426"
383 y="48.80611"
[33840]384 id="text8548"><tspan
385 sodipodi:role="line"
[33856]386 x="123.35426"
387 y="48.80611"
[33840]388 style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332"
389 id="tspan8550">content_languages = 'mri'</tspan><tspan
390 sodipodi:role="line"
[33856]391 x="123.35426"
392 y="53.744999"
[33840]393 style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332"
394 id="tspan8558">+</tspan><tspan
395 sodipodi:role="line"
[33856]396 x="123.35426"
397 y="58.683887"
[33840]398 style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332"
399 id="tspan8560">warc to wet</tspan></text>
400 <flowRoot
[33856]401 transform="matrix(0.26458333,0,0,0.26458333,40.471328,109.86496)"
[33840]402 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
403 id="flowRoot6219-8"
404 xml:space="preserve"><flowRegion
405 id="flowRegion6221-7"><rect
406 y="185.37683"
407 x="310"
408 height="37.142857"
409 width="54.285713"
410 id="rect6223-6" /></flowRegion><flowPara
411 id="flowPara6225-5">...</flowPara></flowRoot> <g
412 id="g8667"
[33856]413 transform="translate(-6.7907814,21.476664)">
[33840]414 <rect
415 y="124.0766"
416 x="24.588797"
417 height="21.381561"
418 width="48.375786"
419 id="rect8562"
420 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
421 <rect
422 y="118.99849"
423 x="24.588797"
424 height="5.0781121"
425 width="14.165285"
426 id="rect8564"
427 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
428 <flowRoot
429 transform="matrix(0.26458333,0,0,0.25571898,0.53453906,7.6615679)"
430 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
431 id="flowRoot8566"
432 xml:space="preserve"><flowRegion
433 id="flowRegion8568"><rect
434 y="468.95096"
435 x="92.934036"
436 height="81.405365"
437 width="182.83763"
438 id="rect8570" /></flowRegion><flowPara
439 id="flowPara8572">site 00001</flowPara><flowPara
440 id="flowPara8574">seedURLs + url filters</flowPara></flowRoot> </g>
441 <g
442 id="g8709"
[33856]443 transform="translate(6.2625586,31.663979)">
[33840]444 <g
445 id="g8744">
446 <rect
447 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
448 id="rect8562-6"
449 width="48.375786"
450 height="21.381561"
451 x="127.08665"
452 y="123.94297" />
453 <rect
454 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
455 id="rect8564-8"
456 width="14.165285"
457 height="5.0781121"
458 x="127.08665"
459 y="118.86487" />
460 <flowRoot
461 xml:space="preserve"
462 id="flowRoot8566-8"
463 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
464 transform="matrix(0.26458333,0,0,0.25571898,103.0324,7.5279456)"><flowRegion
465 id="flowRegion8568-0"><rect
466 id="rect8570-8"
467 width="182.83763"
468 height="81.405365"
469 x="92.934036"
470 y="468.95096" /></flowRegion><flowPara
471 id="flowPara8572-9">site 014##</flowPara><flowPara
472 id="flowPara8574-1">seedURLs + url filters</flowPara></flowRoot> </g>
473 </g>
474 <g
475 id="g8658"
[33856]476 transform="translate(-12.536961,-1.8138882)">
[33840]477 <rect
478 y="153.60989"
479 x="81.918106"
480 height="21.381561"
481 width="48.375786"
482 id="rect8562-3"
483 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
484 <rect
485 y="148.53178"
486 x="81.918106"
487 height="5.0781121"
488 width="14.165285"
489 id="rect8564-4"
490 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
491 <flowRoot
492 transform="matrix(0.26458333,0,0,0.25571898,57.863852,37.194863)"
493 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
494 id="flowRoot8566-4"
495 xml:space="preserve"><flowRegion
496 id="flowRegion8568-01"><rect
497 y="468.95096"
498 x="92.934036"
499 height="81.405365"
500 width="182.83763"
501 id="rect8570-3" /></flowRegion><flowPara
502 id="flowPara8572-7">site 00002</flowPara><flowPara
503 id="flowPara8574-4">seedURLs + url filters</flowPara></flowRoot> </g>
504 <path
505 style="fill:none;stroke:#000000;stroke-width:0.62406325px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2)"
[33856]506 d="m 79.313097,141.42609 c -0.54085,-18.54837 1.367957,-18.54837 1.367957,-18.54837"
[33840]507 id="path6644-5"
508 inkscape:connector-curvature="0" />
509 <path
510 style="fill:none;stroke:#000000;stroke-width:0.64353597px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8)"
[33856]511 d="m 92.07873,112.40555 c 17.09134,-10.48227 18.03057,-8.85549 18.03057,-8.85549"
[33840]512 id="path6644-5-4"
513 inkscape:connector-curvature="0" />
514 <path
515 style="fill:none;stroke:#000000;stroke-width:0.59350747px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9)"
[33856]516 d="m 79.129233,111.17354 c -0.532249,-17.047571 1.3462,-17.047571 1.3462,-17.047571"
[33840]517 id="path6644-5-5"
518 inkscape:connector-curvature="0" />
519 <flowRoot
520 xml:space="preserve"
521 id="flowRoot10549"
[33841]522 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
[33856]523 transform="translate(-7.2808914,-48.099385)"><flowRegion
[33841]524 id="flowRegion10551"
525 style="stroke-width:0.26458332"><rect
[33840]526 id="rect10553"
[33841]527 width="77.10714"
528 height="14.741072"
529 x="119.0625"
530 y="146.94345"
531 style="stroke-width:0.07000434" /></flowRegion><flowPara
532 id="flowPara10555"
533 style="stroke-width:0.26458332">blacklist + greylist + whitelist +</flowPara><flowPara
534 id="flowPara10557"
535 style="stroke-width:0.26458332">sites needing custom handling</flowPara></flowRoot> <g
536 id="g14180"
[33856]537 transform="translate(-4.3357614,16.135192)">
[33840]538 <g
[33841]539 transform="translate(8.0502882,-22.849676)"
540 id="g8797">
[33840]541 <path
[33841]542 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
[33840]543 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
[33841]544 id="path8783"
[33840]545 inkscape:connector-curvature="0" />
546 <path
[33841]547 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
[33840]548 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
[33841]549 id="path8785"
[33840]550 inkscape:connector-curvature="0" />
551 <path
[33841]552 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
[33840]553 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
[33841]554 id="path8787"
[33840]555 inkscape:connector-curvature="0" />
556 <path
[33841]557 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
[33840]558 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
[33841]559 id="path8791"
[33840]560 inkscape:connector-curvature="0" />
561 </g>
[33841]562 <g
563 transform="translate(89.92074,-16.398799)"
564 id="g8854">
565 <path
566 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
567 d="m 45.884371,117.35848 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
568 id="path8783-9"
569 inkscape:connector-curvature="0" />
570 <path
571 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
572 d="m 48.022521,122.43661 c 17.10525,0 17.10525,0 17.10525,0"
573 id="path8785-2"
574 inkscape:connector-curvature="0" />
575 <path
576 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
577 d="m 46.418911,128.5838 c 17.10525,0 17.10525,0 17.10525,0"
578 id="path8787-3"
579 inkscape:connector-curvature="0" />
580 <path
581 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
582 d="m 45.617101,134.19646 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
583 id="path8791-1"
584 inkscape:connector-curvature="0" />
585 </g>
586 <g
587 style="fill:#ffffff;fill-opacity:0.98412697"
588 transform="translate(46.537087,-15.339242)"
589 id="g8797-3">
590 <g
591 style="fill:#ffffff;fill-opacity:0.98412697"
592 transform="translate(0,2.6458334)"
593 id="g8844">
594 <path
595 inkscape:connector-curvature="0"
596 id="path8783-7"
597 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
598 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
599 <path
600 inkscape:connector-curvature="0"
601 id="path8785-5"
602 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
603 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
604 <path
605 inkscape:connector-curvature="0"
606 id="path8787-8"
607 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
608 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
609 <path
610 inkscape:connector-curvature="0"
611 id="path8791-4"
612 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
613 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
614 </g>
615 </g>
616 <g
617 style="fill:#ffffff;fill-opacity:0.98412697"
618 transform="translate(61.771447,-9.9938385)"
619 id="g8797-3-1">
620 <g
621 style="fill:#ffffff;fill-opacity:0.98412697"
622 transform="translate(0,2.6458334)"
623 id="g8844-6">
624 <path
625 inkscape:connector-curvature="0"
626 id="path8783-7-3"
627 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
628 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
629 <path
630 inkscape:connector-curvature="0"
631 id="path8785-5-7"
632 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
633 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
634 <path
635 inkscape:connector-curvature="0"
636 id="path8787-8-3"
637 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
638 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
639 <path
640 inkscape:connector-curvature="0"
641 id="path8791-4-2"
642 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
643 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
644 </g>
645 </g>
[33840]646 </g>
<!-- Flowed text label "CCWETProcessor.java" (Arial, centred inside rect10761), followed by group g12584, which starts on the label's last line. NOTE(review): flowRoot, flowRegion and flowPara are not SVG 1.1 elements although the root element declares version="1.1"; viewers other than Inkscape may not draw this text. Confirm before publishing the SVG directly. -->
647 <flowRoot
648 xml:space="preserve"
649 id="flowRoot10757"
650 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none"
[33856]651 transform="matrix(0.26458333,0,0,0.26458333,2.4316686,-20.799115)"><flowRegion
[33840]652 id="flowRegion10759"
653 style="text-align:center;text-anchor:middle"><rect
654 id="rect10761"
655 width="253.39339"
656 height="35.589188"
657 x="145.71428"
658 y="509.66254"
659 style="text-align:center;text-anchor:middle" /></flowRegion><flowPara
[33856]660 id="flowPara10763">CCWETProcessor.java</flowPara></flowRoot> <g
[33840]661 id="g12584"
[33856]662 transform="translate(32.724113,-23.619145)">
<!-- g12546: an ellipse on top, a lower arc of the same width, and two vertical lines joining them. Presumably a database cylinder; confirm against the rendered figure. -->
[33840]663 <g
664 transform="translate(59.46747,-6.5481034)"
665 id="g12546">
666 <ellipse
667 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
668 id="path12440"
669 cx="66.523811"
670 cy="232.36606"
671 rx="13.985119"
672 ry="4.5357141" />
673 <path
674 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
675 d="m 80.473281,254.62108 c 1e-5,2.50501 -6.261304,4.53572 -13.985039,4.53572 -7.723738,0 -13.985059,-2.03071 -13.985049,-4.53572"
676 id="path12532"
677 inkscape:connector-curvature="0"
678 sodipodi:nodetypes="csc" />
679 <path
680 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
681 d="m 52.538693,232.36606 -0.0355,22.25502"
682 id="path12538"
683 inkscape:connector-curvature="0" />
684 <path
685 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
686 d="m 80.508929,232.36606 -0.03565,22.25502"
687 id="path12540"
688 inkscape:connector-curvature="0" />
689 </g>
<!-- Flowed text "Database" for the shape above; it ends with an empty flowPara (flowPara12572). -->
690 <flowRoot
691 transform="matrix(0.26458333,0,0,0.26458333,1.3363477,3.2072344)"
692 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
693 id="flowRoot12564"
694 xml:space="preserve"><flowRegion
695 id="flowRegion12566"><rect
696 y="876.54755"
697 x="431.33514"
698 height="57.07362"
699 width="86.873116"
700 id="rect12568" /></flowRegion><flowPara
701 id="flowPara12570">Database</flowPara><flowPara
702 id="flowPara12572" /></flowRoot> </g>
<!-- Unfilled connector curve with an arrow marker at its start point. NOTE(review): marker-start points at #Arrow2Lstart-2-9-2, while the marker visible at the top of the file is named Arrow2Lstart-2-9-2-8; confirm that a marker with the exact referenced id exists in defs, otherwise the arrow head will not be drawn. -->
703 <path
[33841]704 style="fill:none;stroke:#000000;stroke-width:0.61500657px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2)"
[33856]705 d="m 158.71539,200.26634 c -0.49158,-19.81955 1.24333,-19.81955 1.24333,-19.81955"
[33841]706 id="path6644-5-5-7"
707 inkscape:connector-curvature="0" />
<!-- Unfilled connector curve with an arrow marker at its start point. NOTE(review): marker-start points at #Arrow2Lstart-2-8-8, while the marker visible at the top of the file is named Arrow2Lstart-2-8-8-6; confirm that a marker with the exact referenced id exists in defs, otherwise the arrow head will not be drawn. -->
708 <path
709 style="fill:none;stroke:#000000;stroke-width:0.78573805px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8)"
[33856]710 d="m 107.42299,211.64412 c 30.18095,-0.52693 30.18093,1.33273 30.18093,1.33273"
[33840]711 id="path6644-5-4-0"
712 inkscape:connector-curvature="0" />
<!-- Group g13297: three page-like icons, a large transparent rect with a small rect sitting on its top-left edge, and the flowed text labels "00001.txt", "014##.txt" and "...". Presumably a folder of text dump files; confirm against the rendered figure. -->
[33856]713 <g
[33840]714 id="g13297"
[33856]715 transform="translate(6.7644786,-26.026385)">
<!-- First page icon: closed outline plus three strokes, drawn with fill:none. -->
[33840]716 <g
717 transform="translate(-80.712161,97.488904)"
718 id="g8797-1">
719 <path
720 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
721 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
722 id="path8783-90"
723 inkscape:connector-curvature="0" />
724 <path
725 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
726 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
727 id="path8785-9"
728 inkscape:connector-curvature="0" />
729 <path
730 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
731 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
732 id="path8787-1"
733 inkscape:connector-curvature="0" />
734 <path
735 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
736 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
737 id="path8791-3"
738 inkscape:connector-curvature="0" />
739 </g>
<!-- Second page icon: same outline and strokes, filled opaque white. -->
740 <g
741 transform="translate(1.1582912,103.93979)"
742 id="g8854-6">
743 <path
744 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
745 d="m 45.884371,117.35848 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
746 id="path8783-9-9"
747 inkscape:connector-curvature="0" />
748 <path
749 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
750 d="m 48.022521,122.43661 c 17.10525,0 17.10525,0 17.10525,0"
751 id="path8785-2-7"
752 inkscape:connector-curvature="0" />
753 <path
754 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
755 d="m 46.418911,128.5838 c 17.10525,0 17.10525,0 17.10525,0"
756 id="path8787-3-9"
757 inkscape:connector-curvature="0" />
758 <path
759 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
760 d="m 45.617101,134.19646 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
761 id="path8791-1-6"
762 inkscape:connector-curvature="0" />
763 </g>
<!-- Third page icon: white fill at opacity 0.98412697, nested in an extra group that shifts it by 2.6458334 in y. -->
764 <g
765 style="fill:#ffffff;fill-opacity:0.98412697"
766 transform="translate(-42.225361,104.99936)"
767 id="g8797-3-0">
768 <g
769 style="fill:#ffffff;fill-opacity:0.98412697"
770 transform="translate(0,2.6458334)"
771 id="g8844-65">
772 <path
773 inkscape:connector-curvature="0"
774 id="path8783-7-8"
775 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
776 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
777 <path
778 inkscape:connector-curvature="0"
779 id="path8785-5-9"
780 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
781 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
782 <path
783 inkscape:connector-curvature="0"
784 id="path8787-8-2"
785 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
786 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
787 <path
788 inkscape:connector-curvature="0"
789 id="path8791-4-8"
790 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
791 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
792 </g>
793 </g>
<!-- Large frame: black-stroked rect with fill-opacity 0, so only its border shows. -->
794 <rect
795 y="211.87975"
796 x="17.764881"
797 height="45.357147"
798 width="76.729172"
799 id="rect13101"
800 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
<!-- Three flowed text labels follow ("00001.txt", "014##.txt", "..."), then rect13199, a small rect at the same x as the frame whose bottom edge meets the frame's top edge. -->
801 <flowRoot
802 transform="matrix(0.26458333,0,0,0.26458333,-1.5874999,1.5875)"
803 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
804 id="flowRoot13127"
805 xml:space="preserve"><flowRegion
806 id="flowRegion13129"><rect
807 y="893.01202"
808 x="79.814285"
809 height="48.079132"
810 width="87.32859"
811 id="rect13131" /></flowRegion><flowPara
812 id="flowPara13133">00001.txt</flowPara></flowRoot> <flowRoot
813 transform="matrix(0.26458333,0,0,0.26458333,43.353972,11.524134)"
814 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
815 id="flowRoot13127-3"
816 xml:space="preserve"><flowRegion
817 id="flowRegion13129-7"><rect
818 y="893.01202"
819 x="79.814285"
820 height="48.079132"
821 width="87.32859"
822 id="rect13131-3" /></flowRegion><flowPara
823 id="flowPara13133-0">014##.txt</flowPara></flowRoot> <flowRoot
824 transform="matrix(0.26458333,0,0,0.26458333,9.5250005,1.0583333)"
825 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
826 id="flowRoot13191"
827 xml:space="preserve"><flowRegion
828 id="flowRegion13193"><rect
829 y="912.20496"
830 x="152.54521"
831 height="37.176605"
832 width="91.12677"
833 id="rect13195" /></flowRegion><flowPara
834 id="flowPara13197">...</flowPara></flowRoot> <rect
835 y="205.83212"
836 x="17.764881"
837 height="6.0476165"
838 width="15.497023"
839 id="rect13199"
840 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
841 </g>
<!-- Group g12584-4: an ellipse, a lower arc and two vertical lines (presumably a database cylinder) with the flowed text label "MongoDB". Its geometry and style values match group g12584; only the ids, the group translate and the label text differ. -->
[33841]842 <g
843 id="g12584-4"
[33856]844 transform="translate(-85.795971,34.946485)">
<!-- Shape: ellipse on top, lower arc of the same width, two vertical lines joining them. -->
[33841]845 <g
846 transform="translate(59.46747,-6.5481034)"
847 id="g12546-1">
848 <ellipse
849 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
850 id="path12440-2"
851 cx="66.523811"
852 cy="232.36606"
853 rx="13.985119"
854 ry="4.5357141" />
855 <path
856 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
857 d="m 80.473281,254.62108 c 1e-5,2.50501 -6.261304,4.53572 -13.985039,4.53572 -7.723738,0 -13.985059,-2.03071 -13.985049,-4.53572"
858 id="path12532-6"
859 inkscape:connector-curvature="0"
860 sodipodi:nodetypes="csc" />
861 <path
862 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
863 d="m 52.538693,232.36606 -0.0355,22.25502"
864 id="path12538-3"
865 inkscape:connector-curvature="0" />
866 <path
867 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
868 d="m 80.508929,232.36606 -0.03565,22.25502"
869 id="path12540-5"
870 inkscape:connector-curvature="0" />
871 </g>
<!-- Flowed text "MongoDB"; it ends with an empty flowPara (flowPara12572-4). -->
872 <flowRoot
873 transform="matrix(0.26458333,0,0,0.26458333,1.3363477,3.2072344)"
874 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
875 id="flowRoot12564-1"
876 xml:space="preserve"><flowRegion
877 id="flowRegion12566-0"><rect
878 y="876.54755"
879 x="431.33514"
880 height="57.07362"
881 width="86.873116"
882 id="rect12568-8" /></flowRegion><flowPara
883 id="flowPara12570-9">MongoDB</flowPara><flowPara
884 id="flowPara12572-4" /></flowRoot> </g>
<!-- Two-line label "Crawl with " / "Apache Nutch", written as a plain text element with one tspan per line. Both tspans share x=125.82693; the second sits at y=192.4917. -->
885 <text
886 xml:space="preserve"
887 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
[33856]888 x="125.82693"
889 y="186.3181"
[33841]890 id="text14721"><tspan
891 sodipodi:role="line"
892 id="tspan14719"
[33856]893 x="125.82693"
894 y="186.3181"
[33841]895 style="stroke-width:0.26458332">Crawl with </tspan><tspan
896 sodipodi:role="line"
[33856]897 x="125.82693"
898 y="192.4917"
[33841]899 style="stroke-width:0.26458332"
900 id="tspan15630">Apache Nutch</tspan></text>
<!-- Unfilled connector curve with an arrow marker at its start point. The marker id Arrow2Lstart-2-9-2-8 is one of the markers defined in defs at the top of the file. -->
901 <path
902 style="fill:none;stroke:#000000;stroke-width:0.62741137px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2-8)"
[33856]903 d="m 38.819713,258.57246 c -0.418819,-24.21071 1.059299,-24.21071 1.059299,-24.21071"
[33841]904 id="path6644-5-5-7-9"
905 inkscape:connector-curvature="0" />
<!-- Three elements that share lines: (1) flowed text "get text dump of" / "each crawled site"; (2) flowed text "NutchTextDumpToMongoDB.java" with two bullet lines; (3) ellipse path15809, drawn with fill-opacity 0 so only its black outline shows. NOTE(review): flowRoot is not an SVG 1.1 element, so viewers other than Inkscape may not draw these two labels; confirm before publishing the SVG directly. -->
906 <flowRoot
907 xml:space="preserve"
908 id="flowRoot15304"
909 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
[33856]910 transform="matrix(0.26458333,0,0,0.26458333,-11.756351,-2.2026872)"><flowRegion
[33841]911 id="flowRegion15306"><rect
912 id="rect15308"
913 width="147.14287"
914 height="54.285732"
915 x="444.28571"
916 y="833.94824" /></flowRegion><flowPara
917 id="flowPara15310">get text dump of</flowPara><flowPara
918 id="flowPara15314">each crawled site</flowPara></flowRoot> <flowRoot
919 xml:space="preserve"
920 id="flowRoot10757-0"
921 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
[33856]922 transform="matrix(0.26458333,0,0,0.26458333,7.1679686,102.72802)"><flowRegion
[33841]923 id="flowRegion10759-1"
924 style="text-align:start;text-anchor:start"><rect
925 id="rect10761-2"
926 width="360.54208"
927 height="74.783302"
928 x="145.71428"
929 y="509.66254"
930 style="text-align:start;text-anchor:start" /></flowRegion><flowPara
931 id="flowPara10763-6">NutchTextDumpToMongoDB.java</flowPara><flowPara
932 id="flowPara15752">- compute + store site and page level meta</flowPara><flowPara
[33856]933 id="flowPara15748">- store full text per web page</flowPara></flowRoot> <ellipse
[33841]934 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
935 id="path15809"
[33856]936 cx="141.89096"
937 cy="271.1738"
[33841]938 rx="41.766369"
939 ry="14.930058" />
<!-- Unfilled connector curve running right to left (absolute coordinates, x from 89.23779 to 59.056862) with an arrow marker at its start point. The marker id Arrow2Lstart-2-8-8-6 is one of the markers defined in defs at the top of the file. -->
940 <path
941 style="fill:none;stroke:#000000;stroke-width:0.78573805px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8-6)"
[33856]942 d="M 89.23779,272.13107 C 59.056841,271.60414 59.056862,273.4638 59.056862,273.4638"
[33841]943 id="path6644-5-4-0-2"
944 inkscape:connector-curvature="0" />
<!-- Centred flowed text "Filter" / "with MongoDB queries". The closing g tag on the last line ends the enclosing group, which was opened earlier in the file. NOTE(review): flowRoot is not an SVG 1.1 element, so viewers other than Inkscape may not draw this label; confirm before publishing the SVG directly. -->
945 <flowRoot
[33856]946 transform="matrix(0.26458333,0,0,0.26458333,-2.6849314,-0.31280524)"
[33841]947 xml:space="preserve"
948 id="flowRoot16118"
[33856]949 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none"><flowRegion
[33841]950 id="flowRegion16120"
951 style="text-align:center;text-anchor:middle"><rect
952 id="rect16122"
953 width="222.85715"
954 height="55.714287"
955 x="434.28571"
956 y="996.80542"
957 style="text-align:center;text-anchor:middle" /></flowRegion><flowPara
958 id="flowPara16124">Filter</flowPara><flowPara
959 id="flowPara16126">with MongoDB queries</flowPara></flowRoot> </g>
[33840]960</svg>
Note: See TracBrowser for help on using the repository browser.