Changeset 33841
- Timestamp:
- 2020-01-16T21:23:09+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/journal-paper/CommonCrawl_flow.svg
r33840 r33841 16 16 id="svg8" 17 17 inkscape:version="0.92.4 (5da689c313, 2019-01-14)" 18 sodipodi:docname="CommonCrawl_flow .svg">18 sodipodi:docname="CommonCrawl_flow2.svg"> 19 19 <defs 20 20 id="defs2"> … … 45 45 stdDeviation="0.59036602" 46 46 id="feGaussianBlur6146" /> 47 </filter>48 <filter49 inkscape:collect="always"50 style="color-interpolation-filters:sRGB"51 id="filter6144-7"52 x="-0.014400269"53 width="1.0288005"54 y="-0.010285577"55 height="1.0205712">56 <feGaussianBlur57 inkscape:collect="always"58 stdDeviation="0.59036602"59 id="feGaussianBlur6146-5" />60 </filter>61 <filter62 inkscape:collect="always"63 style="color-interpolation-filters:sRGB"64 id="filter6144-7-0"65 x="-0.014400269"66 width="1.0288005"67 y="-0.010285577"68 height="1.0205712">69 <feGaussianBlur70 inkscape:collect="always"71 stdDeviation="0.59036602"72 id="feGaussianBlur6146-5-1" />73 47 </filter> 74 48 <marker … … 147 121 transform="matrix(1.1,0,0,1.1,1.1,0)" /> 148 122 </marker> 123 <marker 124 inkscape:stockid="Arrow2Lstart" 125 orient="auto" 126 refY="0" 127 refX="0" 128 id="Arrow2Lstart-2-9-2-8" 129 style="overflow:visible" 130 inkscape:isstock="true"> 131 <path 132 inkscape:connector-curvature="0" 133 id="path6664-1-1-3-1" 134 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" 135 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 136 transform="matrix(1.1,0,0,1.1,1.1,0)" /> 137 </marker> 138 <marker 139 inkscape:stockid="Arrow2Lstart" 140 orient="auto" 141 refY="0" 142 refX="0" 143 id="Arrow2Lstart-2-8-8-6" 144 style="overflow:visible" 145 inkscape:isstock="true"> 146 <path 147 inkscape:connector-curvature="0" 148 id="path6664-1-8-8-0" 149 style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" 150 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 151 transform="matrix(1.1,0,0,1.1,1.1,0)" /> 152 </marker> 149 153 </defs> 150 154 <sodipodi:namedview … … 156 160 inkscape:pageshadow="2" 157 161 inkscape:zoom="0.7" 158 inkscape:cx=" 375.52817"159 inkscape:cy=" 697.479"162 inkscape:cx="281.24246" 163 inkscape:cy="703.19328" 160 164 inkscape:document-units="mm" 161 inkscape:current-layer="g 5994"165 inkscape:current-layer="g6386" 162 166 showgrid="false" 163 167 inkscape:snap-text-baseline="true" … … 176 180 <dc:type 177 181 rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 178 <dc:title ></dc:title>182 <dc:title /> 179 183 </cc:Work> 180 184 </rdf:RDF> … … 203 207 x="36.663689" 204 208 y="33.804573" 205 style="stroke-width:0.26458332" ></tspan><tspan209 style="stroke-width:0.26458332" /><tspan 206 210 sodipodi:role="line" 207 211 x="36.663689" … … 230 234 x="410" 231 235 y="195.37683" /></flowRegion><flowPara 232 id="flowPara6098" ></flowPara></flowRoot> <flowRoot236 id="flowPara6098" /></flowRoot> <flowRoot 233 237 xml:space="preserve" 234 238 id="flowRoot6100" … … 240 244 x="405.71429" 241 245 y="189.66254" /></flowRegion><flowPara 242 id="flowPara6106" ></flowPara></flowRoot> <flowRoot246 id="flowPara6106" /></flowRoot> <flowRoot 243 247 xml:space="preserve" 244 248 id="flowRoot6108" … … 250 254 x="407.14285" 251 255 y="193.94826" /></flowRegion><flowPara 252 id="flowPara6114" ></flowPara></flowRoot> <g256 id="flowPara6114" /></flowRoot> <g 253 257 id="g6386" 254 258 transform="translate(-3.8146973e-6)"> 255 <rect256 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.15369482;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"257 id="rect5954"258 width="26.947613"259 height="46.602379"260 x="36.174416"261 y="8.9821434" />262 259 <flowRoot 263 260 transform="matrix(0.26458333,0,0,0.26458333,-33.04705,-17.589917)" … … 273 270 width="107.14282" 274 271 id="rect6084" /></flowRegion><flowPara 275 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial"276 id="flowPara6086">CC Sep 2018</flowPara><flowPara277 272 style="font-size:24px;line-height:1" 278 id="flowPara6088"><flowSpan 279 id="flowSpan6118" 280 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">columnar index</flowSpan> </flowPara><flowPara 281 id="flowPara6090" /></flowRoot> <rect 282 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.15369482;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 283 id="rect5954-3" 284 width="26.947613" 285 height="46.602379" 286 x="71.483315" 287 y="8.7612762" /> 288 <flowRoot 289 transform="matrix(0.26458333,0,0,0.26458333,2.9571308,-17.83785)" 290 style="font-style:normal;font-weight:normal;font-size:40px;line-height:1;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;filter:url(#filter6144-7)" 291 id="flowRoot6080-6" 292 xml:space="preserve"><flowRegion 293 style="line-height:1" 294 id="flowRegion6082-9"><rect 295 style="line-height:1" 296 y="106.8054" 297 x="262.85715" 298 height="208.57144" 299 width="107.14282" 300 id="rect6084-7" /></flowRegion><flowPara 301 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial" 302 id="flowPara6086-2">CC Oct 2018</flowPara><flowPara 303 style="font-size:24px;line-height:1" 304 id="flowPara6088-8"><flowSpan 305 id="flowSpan6118-9" 306 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">columnar index</flowSpan> </flowPara><flowPara 307 id="flowPara6090-2" /></flowRoot> <flowRoot 308 transform="matrix(0.26458333,0,0,0.26458333,20.323189,-24.166702)" 309 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 310 id="flowRoot6219" 311 xml:space="preserve"><flowRegion 312 id="flowRegion6221"><rect 313 y="185.37683" 314 x="310" 315 height="37.142857" 316 width="54.285713" 317 id="rect6223" /></flowRegion><flowPara 318 id="flowPara6225">...</flowPara></flowRoot> <flowRoot 319 transform="matrix(0.26458333,0,0,0.26458333,48.314274,-18.592251)" 320 style="font-style:normal;font-weight:normal;font-size:40px;line-height:1;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;filter:url(#filter6144-7-0)" 321 id="flowRoot6080-6-3" 322 xml:space="preserve"><flowRegion 323 style="line-height:1" 324 id="flowRegion6082-9-7"><rect 325 style="line-height:1" 326 y="106.8054" 327 x="262.85715" 328 height="208.57144" 329 width="107.14282" 330 id="rect6084-7-3" /></flowRegion><flowPara 331 id="flowPara6281" 332 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">CC Aug 2019</flowPara><flowPara 333 style="font-size:24px;line-height:1" 334 id="flowPara6088-8-3"><flowSpan 335 id="flowSpan6118-9-9" 336 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">columnar index</flowSpan> </flowPara><flowPara 337 id="flowPara6090-2-8" /></flowRoot> <rect 338 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.15369482;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 339 id="rect5954-3-4" 340 width="26.947613" 341 height="46.602379" 342 x="116.46248" 343 y="8.7612762" /> 273 id="flowPara6088" /><flowPara 274 id="flowPara6090" /></flowRoot> <g 275 id="g13905"> 276 <rect 277 y="8.9821434" 278 x="36.174416" 279 height="32.630665" 280 width="26.947613" 281 id="rect5954" 282 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 283 <flowRoot 284 xml:space="preserve" 285 id="flowRoot6219" 286 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 287 transform="matrix(0.26458333,0,0,0.26458333,20.323189,-24.166702)"><flowRegion 288 id="flowRegion6221"><rect 289 id="rect6223" 290 width="54.285713" 291 height="37.142857" 292 x="310" 293 y="185.37683" /></flowRegion><flowPara 294 id="flowPara6225">...</flowPara></flowRoot> <flowRoot 295 transform="matrix(0.26458333,0,0,0.26458333,1.8898848,4.5357143)" 296 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 297 id="flowRoot13798" 298 xml:space="preserve"><flowRegion 299 id="flowRegion13800"><rect 300 y="33.948257" 301 x="136.72218" 302 height="106.18565" 303 width="94.70639" 304 id="rect13802" /></flowRegion><flowPara 305 id="flowPara13804">CC Sep 2018</flowPara><flowPara 306 id="flowPara13806">Columnar Index</flowPara><flowPara 307 id="flowPara13808" /></flowRoot> <rect 308 y="8.8441896" 309 x="69.302986" 310 height="32.630665" 311 width="26.947613" 312 id="rect5954-4" 313 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 314 <flowRoot 315 transform="matrix(0.26458333,0,0,0.26458333,35.018452,4.3977563)" 316 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 317 id="flowRoot13798-0" 318 xml:space="preserve"><flowRegion 319 id="flowRegion13800-2"><rect 320 y="33.948257" 321 x="136.72218" 322 height="106.18565" 323 width="94.70639" 324 id="rect13802-0" /></flowRegion><flowPara 325 id="flowPara13804-8">CC Oct 2018</flowPara><flowPara 326 id="flowPara13806-1">Columnar Index</flowPara><flowPara 327 id="flowPara13808-6" /></flowRoot> <rect 328 y="8.9197874" 329 x="113.52618" 330 height="32.630665" 331 width="26.947613" 332 id="rect5954-1" 333 style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 334 <flowRoot 335 transform="matrix(0.26458333,0,0,0.26458333,79.241667,4.4733578)" 336 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 337 id="flowRoot13798-09" 338 xml:space="preserve"><flowRegion 339 id="flowRegion13800-0"><rect 340 y="33.948257" 341 x="136.72218" 342 height="106.18565" 343 width="94.70639" 344 id="rect13802-07" /></flowRegion><flowPara 345 id="flowPara13876">CC Aug 2019</flowPara><flowPara 346 id="flowPara13806-6">Columnar Index</flowPara><flowPara 347 id="flowPara13808-1" /></flowRoot> </g> 344 348 </g> 345 349 </g> … … 354 358 x="154.28572" 355 359 y="568.23395" /></flowRegion><flowPara 356 id="flowPara5976" ></flowPara></flowRoot> <flowRoot360 id="flowPara5976" /></flowRoot> <flowRoot 357 361 xml:space="preserve" 358 362 id="flowRoot5980" … … 364 368 x="1.4285715" 365 369 y="65.376831" /></flowRegion><flowPara 366 id="flowPara5986" ></flowPara></flowRoot> <flowRoot370 id="flowPara5986" /></flowRoot> <flowRoot 367 371 xml:space="preserve" 368 372 id="flowRoot6469" … … 374 378 x="90.714287" 375 379 y="612.51971" /></flowRegion><flowPara 376 id="flowPara6475" ></flowPara></flowRoot> <g380 id="flowPara6475" /></flowRoot> <g 377 381 id="g8700"> 378 <rect379 y="81.553574"380 x="20.788691"381 height="24.190477"382 width="51.026787"383 id="rect6388"384 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />385 <rect386 y="76.261909"387 x="20.788691"388 height="5.2916689"389 width="15.497024"390 id="rect6390"391 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />392 382 <flowRoot 393 383 transform="matrix(0.26458333,0,0,0.26458333,2.6458333,3.4017858)" … … 401 391 width="192.85715" 402 392 id="rect6396" /></flowRegion><flowPara 403 id="flowPara6400">*.warc.wet files</flowPara><flowPara 404 id="flowPara6404">CC Sep 2018</flowPara></flowRoot> <g 405 transform="translate(-32.883929,-20.197169)" 406 id="g6579"> 393 id="flowPara6404" /></flowRoot> <g 394 id="g14156" 395 transform="translate(0,-14.81667)"> 407 396 <rect 408 style="opacity:1;fill:# ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"409 id="rect6388 -2"397 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 398 id="rect6388" 410 399 width="51.026787" 411 400 height="24.190477" 412 x=" 99.974701"413 y=" 103.47619" />401 x="20.788691" 402 y="81.553574" /> 414 403 <rect 415 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 416 id="rect6390-7" 417 width="15.497025" 418 height="5.2916694" 419 x="99.974701" 420 y="98.184525" /> 404 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 405 id="rect6390" 406 width="15.497024" 407 height="5.2916689" 408 x="20.788691" 409 y="76.261909" /> 410 <g 411 id="g6579" 412 transform="translate(-32.883929,-20.197169)"> 413 <rect 414 y="103.47619" 415 x="99.974701" 416 height="24.190477" 417 width="51.026787" 418 id="rect6388-2" 419 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 420 <rect 421 y="98.184525" 422 x="99.974701" 423 height="5.2916694" 424 width="15.497025" 425 id="rect6390-7" 426 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 427 </g> 428 <g 429 id="g6630" 430 transform="translate(61.241527,-46.849824)"> 431 <rect 432 y="131.19792" 433 x="51.460232" 434 height="24.190477" 435 width="51.026787" 436 id="rect6388-8-4" 437 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" /> 438 <rect 439 y="125.90625" 440 x="51.460232" 441 height="5.2916694" 442 width="15.497025" 443 id="rect6390-9-4" 444 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" /> 445 </g> 446 <g 447 id="g6591" 448 transform="translate(21.43299,-58.018398)"> 449 <rect 450 y="143.1637" 451 x="111.31399" 452 height="24.190477" 453 width="51.026787" 454 id="rect6388-8" 455 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 456 <rect 457 y="137.87202" 458 x="111.31399" 459 height="5.2916694" 460 width="15.497025" 461 id="rect6390-9" 462 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> 463 </g> 421 464 <flowRoot 422 xml:space="preserve" 423 id="flowRoot6392-0" 424 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 425 transform="matrix(0.26458333,0,0,0.26458333,81.831845,25.3244)"><flowRegion 426 id="flowRegion6394-8"><rect 427 id="rect6396-9" 428 width="192.85715" 429 height="91.428543" 465 transform="matrix(0.26458333,0,0,0.26458333,4.9136906,4.8380953)" 466 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 467 id="flowRoot14053" 468 xml:space="preserve"><flowRegion 469 id="flowRegion14055"><rect 470 y="308.23401" 430 471 x="78.571426" 431 y="308.23401" /></flowRegion><flowPara 432 id="flowPara6400-8">*.warc.wet files</flowPara><flowPara 433 id="flowPara6404-1">CC Oct 2018</flowPara></flowRoot> </g> 434 <g 435 transform="translate(61.241527,-46.849824)" 436 id="g6630"> 437 <rect 438 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" 439 id="rect6388-8-4" 440 width="51.026787" 441 height="24.190477" 442 x="51.460232" 443 y="131.19792" /> 444 <rect 445 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" 446 id="rect6390-9-4" 447 width="15.497025" 448 height="5.2916694" 449 x="51.460232" 450 y="125.90625" /> 451 </g> 452 <g 453 transform="translate(21.43299,-58.018398)" 454 id="g6591"> 455 <rect 456 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 457 id="rect6388-8" 458 width="51.026787" 459 height="24.190477" 460 x="111.31399" 461 y="143.1637" /> 462 <rect 463 style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" 464 id="rect6390-9" 465 width="15.497025" 466 height="5.2916694" 467 x="111.31399" 468 y="137.87202" /> 469 <text 470 xml:space="preserve" 471 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.3499999px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" 472 x="115.66071" 473 y="151.85715" 474 id="text6479"><tspan 475 sodipodi:role="line" 476 id="tspan6477" 477 x="115.66071" 478 y="151.85715" 479 style="stroke-width:0.26458332">*.warc.wet files</tspan><tspan 480 sodipodi:role="line" 481 x="115.66071" 482 y="159.79465" 483 style="stroke-width:0.26458332" 484 id="tspan6481">CC Aug 2019</tspan></text> 485 </g> 472 height="52.235638" 473 width="142.14285" 474 id="rect14057" /></flowRegion><flowPara 475 id="flowPara14059">*.warc.wet files</flowPara><flowPara 476 id="flowPara14061">CC Sep 2018</flowPara></flowRoot> <flowRoot 477 transform="matrix(0.26458333,0,0,0.26458333,50.006029,7.3307912)" 478 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 479 id="flowRoot14053-6" 480 xml:space="preserve"><flowRegion 481 id="flowRegion14055-5"><rect 482 y="308.23401" 483 x="78.571426" 484 height="52.235638" 485 width="142.14285" 486 id="rect14057-1" /></flowRegion><flowPara 487 id="flowPara14059-3">*.warc.wet files</flowPara><flowPara 488 id="flowPara14061-0">CC Oct 2018</flowPara></flowRoot> <flowRoot 489 transform="matrix(0.26458333,0,0,0.26458333,114.78021,8.3683492)" 490 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 491 id="flowRoot14053-6-6" 492 xml:space="preserve"><flowRegion 493 id="flowRegion14055-5-5"><rect 494 y="308.23401" 495 x="78.571426" 496 height="52.235638" 497 width="142.14285" 498 id="rect14057-1-8" /></flowRegion><flowPara 499 id="flowPara14059-3-3">*.warc.wet files</flowPara><flowPara 500 id="flowPara14061-0-1">CC Aug 2019</flowPara></flowRoot> </g> 486 501 </g> 487 502 <path 488 503 style="fill:none;stroke:#000000;stroke-width:0.64353597px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart)" 489 d="M 91.563371, 76.232879 C 91.031116,56.190155 92.909569,56.190155 92.909569,56.190155"504 d="M 91.563371,63.532871 C 91.031116,43.490147 92.909569,43.490147 92.909569,43.490147" 490 505 id="path6644" 491 506 inkscape:connector-curvature="0" /> … … 500 515 x="169.53333" 501 516 y="66.803123" 502 style="stroke-width:0.26458332" ></tspan><tspan517 style="stroke-width:0.26458332" /><tspan 503 518 sodipodi:role="line" 504 519 x="169.53333" … … 526 541 x="372.02237" 527 542 y="209.24846" /></flowRegion><flowPara 528 id="flowPara8534" ></flowPara></flowRoot> <flowRoot543 id="flowPara8534" /></flowRoot> <flowRoot 529 544 xml:space="preserve" 530 545 id="flowRoot8538" … … 541 556 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" 542 557 x="126.03919" 543 y=" 63.406425"558 y="49.118912" 544 559 id="text8548"><tspan 545 560 sodipodi:role="line" 546 561 x="126.03919" 547 y=" 63.406425"562 y="49.118912" 548 563 style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332" 549 564 id="tspan8550">content_languages = 'mri'</tspan><tspan 550 565 sodipodi:role="line" 551 566 x="126.03919" 552 y=" 68.345314"567 y="54.0578" 553 568 style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332" 554 569 id="tspan8558">+</tspan><tspan 555 570 sodipodi:role="line" 556 571 x="126.03919" 557 y=" 73.284203"572 y="58.996689" 558 573 style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332" 559 574 id="tspan8560">warc to wet</tspan></text> 560 575 <flowRoot 561 transform="matrix(0.26458333,0,0,0.26458333, 38.922917,130.28623)"576 transform="matrix(0.26458333,0,0,0.26458333,43.156253,110.17777)" 562 577 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 563 578 id="flowRoot6219-8" … … 571 586 id="flowPara6225-5">...</flowPara></flowRoot> <g 572 587 id="g8667" 573 transform="translate(- 8.3391904,40.839482)">588 transform="translate(-4.1058568,21.78947)"> 574 589 <rect 575 590 y="124.0766" … … 601 616 <g 602 617 id="g8709" 603 transform="translate( 4.7141497,49.968462)">618 transform="translate(8.9474833,31.976784)"> 604 619 <g 605 620 id="g8744"> … … 634 649 <g 635 650 id="g8658" 636 transform="translate(- 14.085366,17.548923)">651 transform="translate(-9.8520303,-1.5010822)"> 637 652 <rect 638 653 y="153.60989" … … 662 677 id="flowPara8572-7">site 00002</flowPara><flowPara 663 678 id="flowPara8574-4">seedURLs + url filters</flowPara></flowRoot> </g> 664 <g665 id="g8797"666 transform="translate(8.0502882,-6.9746679)">667 <path668 inkscape:connector-curvature="0"669 id="path8783"670 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"671 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />672 <path673 inkscape:connector-curvature="0"674 id="path8785"675 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"676 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />677 <path678 inkscape:connector-curvature="0"679 id="path8787"680 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"681 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />682 <path683 inkscape:connector-curvature="0"684 id="path8791"685 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"686 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />687 </g>688 <g689 id="g8854"690 transform="translate(89.92074,-0.52379428)">691 <path692 inkscape:connector-curvature="0"693 id="path8783-9"694 d="m 45.884371,117.35848 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"695 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />696 <path697 inkscape:connector-curvature="0"698 id="path8785-2"699 d="m 48.022521,122.43661 c 17.10525,0 17.10525,0 17.10525,0"700 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />701 <path702 inkscape:connector-curvature="0"703 id="path8787-3"704 d="m 46.418911,128.5838 c 17.10525,0 17.10525,0 17.10525,0"705 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />706 <path707 inkscape:connector-curvature="0"708 id="path8791-1"709 d="m 45.617101,134.19646 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"710 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />711 </g>712 <g713 id="g8797-3"714 transform="translate(46.537087,0.53576233)"715 style="fill:#ffffff;fill-opacity:0.98412697">716 <g717 id="g8844"718 transform="translate(0,2.6458334)"719 style="fill:#ffffff;fill-opacity:0.98412697">720 <path721 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"722 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"723 id="path8783-7"724 inkscape:connector-curvature="0" />725 <path726 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"727 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"728 id="path8785-5"729 inkscape:connector-curvature="0" />730 <path731 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"732 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"733 id="path8787-8"734 inkscape:connector-curvature="0" />735 <path736 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"737 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"738 id="path8791-4"739 inkscape:connector-curvature="0" />740 </g>741 </g>742 679 <path 743 680 style="fill:none;stroke:#000000;stroke-width:0.62406325px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2)" 744 d="M 81.998023,1 62.37652 C 81.457172,143.82815 83.36598,143.82815 83.36598,143.82815"681 d="M 81.998023,141.73889 C 81.457172,123.19052 83.36598,123.19052 83.36598,123.19052" 745 682 id="path6644-5" 746 683 inkscape:connector-curvature="0" /> 747 684 <path 748 685 style="fill:none;stroke:#000000;stroke-width:0.64353597px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8)" 749 d=" m 105.88728,142.56554 c 19.8306,2.95621 19.50441,4.80612 19.50441,4.80612"686 d="M 94.763655,112.71836 C 111.855,102.23609 112.79422,103.86287 112.79422,103.86287" 750 687 id="path6644-5-4" 751 688 inkscape:connector-curvature="0" /> 752 689 <path 753 690 style="fill:none;stroke:#000000;stroke-width:0.59350747px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9)" 754 d="m 81.814159,1 27.36144 c -0.53225,-17.0476 1.3462,-17.0476 1.3462,-17.0476"691 d="m 81.814159,111.48634 c -0.53225,-17.047567 1.3462,-17.047567 1.3462,-17.047567" 755 692 id="path6644-5-5" 756 693 inkscape:connector-curvature="0" /> … … 758 695 xml:space="preserve" 759 696 id="flowRoot10549" 760 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 761 transform="matrix(0.26458333,0,0,0.26458333,0,2.1166667)"><flowRegion 762 id="flowRegion10551"><rect 697 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" 698 transform="translate(-4.5959653,-47.786576)"><flowRegion 699 id="flowRegion10551" 700 style="stroke-width:0.26458332"><rect 763 701 id="rect10553" 764 width="291.42856" 765 height="55.714287" 766 x="450" 767 y="555.37683" /></flowRegion><flowPara 768 id="flowPara10555">blacklist + greylist + whitelist +</flowPara><flowPara 769 id="flowPara10557">sites needing custom handling</flowPara></flowRoot> <g 770 id="g8797-3-1" 771 transform="translate(61.771447,5.8811623)" 772 style="fill:#ffffff;fill-opacity:0.98412697"> 702 width="77.10714" 703 height="14.741072" 704 x="119.0625" 705 y="146.94345" 706 style="stroke-width:0.07000434" /></flowRegion><flowPara 707 id="flowPara10555" 708 style="stroke-width:0.26458332">blacklist + greylist + whitelist +</flowPara><flowPara 709 id="flowPara10557" 710 style="stroke-width:0.26458332">sites needing custom handling</flowPara></flowRoot> <g 711 id="g14180" 712 transform="translate(-1.6508314,16.447997)"> 773 713 <g 774 id="g8844-6" 775 transform="translate(0,2.6458334)" 776 style="fill:#ffffff;fill-opacity:0.98412697"> 777 <path 778 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 714 transform="translate(8.0502882,-22.849676)" 715 id="g8797"> 716 <path 717 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 779 718 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z" 780 id="path8783 -7-3"781 inkscape:connector-curvature="0" /> 782 <path 783 style="fill: #ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"719 id="path8783" 720 inkscape:connector-curvature="0" /> 721 <path 722 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 784 723 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0" 785 id="path8785 -5-7"786 inkscape:connector-curvature="0" /> 787 <path 788 style="fill: #ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"724 id="path8785" 725 inkscape:connector-curvature="0" /> 726 <path 727 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 789 728 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0" 790 id="path8787 -8-3"791 inkscape:connector-curvature="0" /> 792 <path 793 style="fill: #ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"729 id="path8787" 730 inkscape:connector-curvature="0" /> 731 <path 732 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 794 733 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726" 795 id="path8791-4-2" 796 inkscape:connector-curvature="0" /> 734 id="path8791" 735 inkscape:connector-curvature="0" /> 736 </g> 737 <g 738 transform="translate(89.92074,-16.398799)" 739 id="g8854"> 740 <path 741 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 742 d="m 45.884371,117.35848 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z" 743 id="path8783-9" 744 inkscape:connector-curvature="0" /> 745 <path 746 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 747 d="m 48.022521,122.43661 c 17.10525,0 17.10525,0 17.10525,0" 748 id="path8785-2" 749 inkscape:connector-curvature="0" /> 750 <path 751 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 752 d="m 46.418911,128.5838 c 17.10525,0 17.10525,0 17.10525,0" 753 id="path8787-3" 754 inkscape:connector-curvature="0" /> 755 <path 756 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 757 d="m 45.617101,134.19646 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726" 758 id="path8791-1" 759 inkscape:connector-curvature="0" /> 760 </g> 761 <g 762 style="fill:#ffffff;fill-opacity:0.98412697" 763 transform="translate(46.537087,-15.339242)" 764 id="g8797-3"> 765 <g 766 style="fill:#ffffff;fill-opacity:0.98412697" 767 transform="translate(0,2.6458334)" 768 id="g8844"> 769 <path 770 inkscape:connector-curvature="0" 771 id="path8783-7" 772 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z" 773 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 774 <path 775 inkscape:connector-curvature="0" 776 id="path8785-5" 777 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0" 778 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 779 <path 780 inkscape:connector-curvature="0" 781 id="path8787-8" 782 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0" 783 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 784 <path 785 inkscape:connector-curvature="0" 786 id="path8791-4" 787 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726" 788 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 789 </g> 790 </g> 791 <g 792 style="fill:#ffffff;fill-opacity:0.98412697" 793 transform="translate(61.771447,-9.9938385)" 794 id="g8797-3-1"> 795 <g 796 style="fill:#ffffff;fill-opacity:0.98412697" 797 transform="translate(0,2.6458334)" 798 id="g8844-6"> 799 <path 800 inkscape:connector-curvature="0" 801 id="path8783-7-3" 802 d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z" 803 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 804 <path 805 inkscape:connector-curvature="0" 806 id="path8785-5-7" 807 d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0" 808 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 809 <path 810 inkscape:connector-curvature="0" 811 id="path8787-8-3" 812 d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0" 813 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 814 <path 815 inkscape:connector-curvature="0" 816 id="path8791-4-2" 817 d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726" 818 style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> 819 </g> 797 820 </g> 798 821 </g> … … 801 824 id="flowRoot10757" 802 825 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none" 803 transform="matrix(0.26458333,0,0,0.26458333,5.1165915,- 0.90713918)"><flowRegion826 transform="matrix(0.26458333,0,0,0.26458333,5.1165915,-20.486313)"><flowRegion 804 827 id="flowRegion10759" 805 828 style="text-align:center;text-anchor:middle"><rect … … 810 833 y="509.66254" 811 834 style="text-align:center;text-anchor:middle" /></flowRegion><flowPara 812 id="flowPara10763">CCWETProcessor.java</flowPara></flowRoot> <path 813 style="fill:none;stroke:#000000;stroke-width:0.58551782px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2)" 814 d="m 124.20743,213.25747 c -0.49158,-17.96448 1.24333,-17.96448 1.24333,-17.96448" 815 id="path6644-5-5-7" 816 inkscape:connector-curvature="0" /> 817 <flowRoot 835 id="flowPara10763">CCWETProcessor.java</flowPara></flowRoot> <flowRoot 818 836 xml:space="preserve" 819 837 id="flowRoot11618" … … 825 843 x="255.71428" 826 844 y="806.80542" /></flowRegion><flowPara 827 id="flowPara11624" ></flowPara></flowRoot> <flowRoot845 id="flowPara11624" /></flowRoot> <flowRoot 828 846 xml:space="preserve" 829 847 id="flowRoot11626" … … 835 853 x="364.28571" 836 854 y="835.37683" /></flowRegion><flowPara 837 id="flowPara11632" ></flowPara></flowRoot> <flowRoot855 id="flowPara11632" /></flowRoot> <flowRoot 838 856 xml:space="preserve" 839 857 id="flowRoot11636" 840 858 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 841 transform="matrix(0.26458333,0,0,0.26458333,41.665157,- 19.117346)"><flowRegion859 transform="matrix(0.26458333,0,0,0.26458333,41.665157,-34.992356)"><flowRegion 842 860 id="flowRegion11638"><rect 843 861 id="rect11640" … … 846 864 x="340" 847 865 y="826.80542" /></flowRegion><flowPara 848 id="flowPara11642" >Crawl with Apache Nutch</flowPara><flowPara866 id="flowPara11642" /><flowPara 849 867 id="flowPara11644" /></flowRoot> <rect 850 868 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:1.00157475;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" … … 867 885 y="875.03229" 868 886 style="fill:#000000;fill-opacity:0;" /></flowRegion><flowPara 869 id="flowPara12554" ></flowPara></flowRoot> <flowRoot887 id="flowPara12554" /></flowRoot> <flowRoot 870 888 xml:space="preserve" 871 889 id="flowRoot12556" … … 877 895 x="431.84021" 878 896 y="871.49677" /></flowRegion><flowPara 879 id="flowPara12562" ></flowPara></flowRoot> <g897 id="flowPara12562" /></flowRoot> <g 880 898 id="g12584" 881 transform="translate( 6.9121608,-2.1711027)">899 transform="translate(35.409039,-23.306338)"> 882 900 <g 883 901 transform="translate(59.46747,-6.5481034)" … … 921 939 id="flowPara12572" /></flowRoot> </g> 922 940 <path 923 style="fill:none;stroke:#000000;stroke-width:0.64151984px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8)" 924 d="m 96.775998,235.2001 c 20.118612,-0.52693 20.118602,1.33273 20.118602,1.33273" 941 style="fill:none;stroke:#000000;stroke-width:0.61500657px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2)" 942 d="m 161.40032,200.57914 c -0.49158,-19.81955 1.24333,-19.81955 1.24333,-19.81955" 943 id="path6644-5-5-7" 944 inkscape:connector-curvature="0" /> 945 <path 946 style="fill:none;stroke:#000000;stroke-width:0.78573805px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8)" 947 d="m 110.10792,211.95692 c 30.18095,-0.52693 30.18093,1.33273 30.18093,1.33273" 925 948 id="path6644-5-4-0" 926 949 inkscape:connector-curvature="0" /> 927 950 <flowRoot 928 xml:space="preserve"929 id="flowRoot12967"930 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none"931 transform="matrix(0.26458333,0,0,0.26458333,-57.452381,56.318459)"><flowRegion932 id="flowRegion12969"933 style="text-align:center;text-anchor:middle"><rect934 id="rect12971"935 width="252.59795"936 height="45.616917"937 x="297.14285"938 y="771.09113"939 style="text-align:center;text-anchor:middle" /></flowRegion><flowPara940 id="flowPara12973">text dump per crawled site</flowPara></flowRoot> <flowRoot941 951 xml:space="preserve" 942 952 id="flowRoot13119" … … 948 958 x="79.814285" 949 959 y="893.01202" /></flowRegion><flowPara 950 id="flowPara13125" ></flowPara></flowRoot> <g960 id="flowPara13125" /></flowRoot> <g 951 961 id="g13297" 952 transform="translate( 0,1.0583333)">962 transform="translate(9.4494046,-25.713581)"> 953 963 <g 954 964 transform="translate(-80.712161,97.488904)" … … 1077 1087 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" /> 1078 1088 </g> 1079 </g> 1089 <g 1090 id="g12584-4" 1091 transform="translate(-83.111041,35.25929)"> 1092 <g 1093 transform="translate(59.46747,-6.5481034)" 1094 id="g12546-1"> 1095 <ellipse 1096 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" 1097 id="path12440-2" 1098 cx="66.523811" 1099 cy="232.36606" 1100 rx="13.985119" 1101 ry="4.5357141" /> 1102 <path 1103 style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" 1104 d="m 80.473281,254.62108 c 1e-5,2.50501 -6.261304,4.53572 -13.985039,4.53572 -7.723738,0 -13.985059,-2.03071 -13.985049,-4.53572" 1105 id="path12532-6" 1106 inkscape:connector-curvature="0" 1107 sodipodi:nodetypes="csc" /> 1108 <path 1109 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 1110 d="m 52.538693,232.36606 -0.0355,22.25502" 1111 id="path12538-3" 1112 inkscape:connector-curvature="0" /> 1113 <path 1114 style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 1115 d="m 80.508929,232.36606 -0.03565,22.25502" 1116 id="path12540-5" 1117 inkscape:connector-curvature="0" /> 1118 </g> 1119 <flowRoot 1120 transform="matrix(0.26458333,0,0,0.26458333,1.3363477,3.2072344)" 1121 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 1122 id="flowRoot12564-1" 1123 xml:space="preserve"><flowRegion 1124 id="flowRegion12566-0"><rect 1125 y="876.54755" 1126 x="431.33514" 1127 height="57.07362" 1128 width="86.873116" 1129 id="rect12568-8" /></flowRegion><flowPara 1130 id="flowPara12570-9">MongoDB</flowPara><flowPara 1131 id="flowPara12572-4" /></flowRoot> </g> 1132 <text 1133 xml:space="preserve" 1134 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" 1135 x="128.51186" 1136 y="186.63091" 1137 id="text14721"><tspan 1138 sodipodi:role="line" 1139 id="tspan14719" 1140 x="128.51186" 1141 y="186.63091" 1142 style="stroke-width:0.26458332">Crawl with </tspan><tspan 1143 sodipodi:role="line" 1144 x="128.51186" 1145 y="192.80452" 1146 style="stroke-width:0.26458332" 1147 id="tspan15630">Apache Nutch</tspan></text> 1148 <path 1149 style="fill:none;stroke:#000000;stroke-width:0.62741137px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2-8)" 1150 d="m 41.504639,258.88526 c -0.418819,-24.21071 1.059298,-24.21071 1.059298,-24.21071" 1151 id="path6644-5-5-7-9" 1152 inkscape:connector-curvature="0" /> 1153 <flowRoot 1154 xml:space="preserve" 1155 id="flowRoot15304" 1156 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 1157 transform="matrix(0.26458333,0,0,0.26458333,-9.0714287,-1.889881)"><flowRegion 1158 id="flowRegion15306"><rect 1159 id="rect15308" 1160 width="147.14287" 1161 height="54.285732" 1162 x="444.28571" 1163 y="833.94824" /></flowRegion><flowPara 1164 id="flowPara15310">get text dump of</flowPara><flowPara 1165 id="flowPara15314">each crawled site</flowPara></flowRoot> <flowRoot 1166 xml:space="preserve" 1167 id="flowRoot10757-0" 1168 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none" 1169 transform="matrix(0.26458333,0,0,0.26458333,9.8528943,103.04083)"><flowRegion 1170 id="flowRegion10759-1" 1171 style="text-align:start;text-anchor:start"><rect 1172 id="rect10761-2" 1173 width="360.54208" 1174 height="74.783302" 1175 x="145.71428" 1176 y="509.66254" 1177 style="text-align:start;text-anchor:start" /></flowRegion><flowPara 1178 id="flowPara10763-6">NutchTextDumpToMongoDB.java</flowPara><flowPara 1179 id="flowPara15752">- compute + store site and page level meta</flowPara><flowPara 1180 id="flowPara15748">- store full text per web page</flowPara></flowRoot> <flowRoot 1181 xml:space="preserve" 1182 id="flowRoot15734" 1183 style="fill:black;fill-opacity:1;stroke:none;font-family:Arial;font-style:normal;font-weight:normal;font-size:18.66666667px;line-height:1.25;letter-spacing:0px;word-spacing:0px;-inkscape-font-specification:'Arial, Normal';font-stretch:normal;font-variant:normal;text-anchor:start;text-align:start;writing-mode:lr;font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"><flowRegion 1184 id="flowRegion15736"><rect 1185 id="rect15738" 1186 width="184.28572" 1187 height="52.312145" 1188 x="345.71429" 1189 y="944.49323" /></flowRegion><flowPara 1190 id="flowPara15740" /></flowRoot> <ellipse 1191 style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" 1192 id="path15809" 1193 cx="144.57588" 1194 cy="271.4866" 1195 rx="41.766369" 1196 ry="14.930058" /> 1197 <path 1198 style="fill:none;stroke:#000000;stroke-width:0.78573805px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8-6)" 1199 d="m 91.922716,272.44387 c -30.180951,-0.52693 -30.18093,1.33273 -30.18093,1.33273" 1200 id="path6644-5-4-0-2" 1201 inkscape:connector-curvature="0" /> 1202 <flowRoot 1203 xml:space="preserve" 1204 id="flowRoot16118" 1205 style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none" 1206 transform="scale(0.26458333)"><flowRegion 1207 id="flowRegion16120" 1208 style="text-align:center;text-anchor:middle"><rect 1209 id="rect16122" 1210 width="222.85715" 1211 height="55.714287" 1212 x="434.28571" 1213 y="996.80542" 1214 style="text-align:center;text-anchor:middle" /></flowRegion><flowPara 1215 id="flowPara16124">Filter</flowPara><flowPara 1216 id="flowPara16126">with MongoDB queries</flowPara></flowRoot> </g> 1080 1217 </svg>
Note:
See TracChangeset
for help on using the changeset viewer.