Changeset 33841


Ignore:
Timestamp:
2020-01-16T21:23:09+13:00 (4 years ago)
Author:
ak19
Message:

Latest version of the flowchart of the process of getting Common Crawl data into MongoDB as websites and webpages collections for querying

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/journal-paper/CommonCrawl_flow.svg

    r33840 r33841  
    1616   id="svg8"
    1717   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
    18    sodipodi:docname="CommonCrawl_flow.svg">
     18   sodipodi:docname="CommonCrawl_flow2.svg">
    1919  <defs
    2020     id="defs2">
     
    4545         stdDeviation="0.59036602"
    4646         id="feGaussianBlur6146" />
    47     </filter>
    48     <filter
    49        inkscape:collect="always"
    50        style="color-interpolation-filters:sRGB"
    51        id="filter6144-7"
    52        x="-0.014400269"
    53        width="1.0288005"
    54        y="-0.010285577"
    55        height="1.0205712">
    56       <feGaussianBlur
    57          inkscape:collect="always"
    58          stdDeviation="0.59036602"
    59          id="feGaussianBlur6146-5" />
    60     </filter>
    61     <filter
    62        inkscape:collect="always"
    63        style="color-interpolation-filters:sRGB"
    64        id="filter6144-7-0"
    65        x="-0.014400269"
    66        width="1.0288005"
    67        y="-0.010285577"
    68        height="1.0205712">
    69       <feGaussianBlur
    70          inkscape:collect="always"
    71          stdDeviation="0.59036602"
    72          id="feGaussianBlur6146-5-1" />
    7347    </filter>
    7448    <marker
     
    147121         transform="matrix(1.1,0,0,1.1,1.1,0)" />
    148122    </marker>
     123    <marker
     124       inkscape:stockid="Arrow2Lstart"
     125       orient="auto"
     126       refY="0"
     127       refX="0"
     128       id="Arrow2Lstart-2-9-2-8"
     129       style="overflow:visible"
     130       inkscape:isstock="true">
     131      <path
     132         inkscape:connector-curvature="0"
     133         id="path6664-1-1-3-1"
     134         style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
     135         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
     136         transform="matrix(1.1,0,0,1.1,1.1,0)" />
     137    </marker>
     138    <marker
     139       inkscape:stockid="Arrow2Lstart"
     140       orient="auto"
     141       refY="0"
     142       refX="0"
     143       id="Arrow2Lstart-2-8-8-6"
     144       style="overflow:visible"
     145       inkscape:isstock="true">
     146      <path
     147         inkscape:connector-curvature="0"
     148         id="path6664-1-8-8-0"
     149         style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
     150         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
     151         transform="matrix(1.1,0,0,1.1,1.1,0)" />
     152    </marker>
    149153  </defs>
    150154  <sodipodi:namedview
     
    156160     inkscape:pageshadow="2"
    157161     inkscape:zoom="0.7"
    158      inkscape:cx="375.52817"
    159      inkscape:cy="697.479"
     162     inkscape:cx="281.24246"
     163     inkscape:cy="703.19328"
    160164     inkscape:document-units="mm"
    161      inkscape:current-layer="g5994"
     165     inkscape:current-layer="g6386"
    162166     showgrid="false"
    163167     inkscape:snap-text-baseline="true"
     
    176180        <dc:type
    177181           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
    178         <dc:title></dc:title>
     182        <dc:title />
    179183      </cc:Work>
    180184    </rdf:RDF>
     
    203207           x="36.663689"
    204208           y="33.804573"
    205            style="stroke-width:0.26458332"></tspan><tspan
     209           style="stroke-width:0.26458332" /><tspan
    206210           sodipodi:role="line"
    207211           x="36.663689"
     
    230234             x="410"
    231235             y="195.37683" /></flowRegion><flowPara
    232            id="flowPara6098"></flowPara></flowRoot>      <flowRoot
     236           id="flowPara6098" /></flowRoot>      <flowRoot
    233237         xml:space="preserve"
    234238         id="flowRoot6100"
     
    240244             x="405.71429"
    241245             y="189.66254" /></flowRegion><flowPara
    242            id="flowPara6106"></flowPara></flowRoot>      <flowRoot
     246           id="flowPara6106" /></flowRoot>      <flowRoot
    243247         xml:space="preserve"
    244248         id="flowRoot6108"
     
    250254             x="407.14285"
    251255             y="193.94826" /></flowRegion><flowPara
    252            id="flowPara6114"></flowPara></flowRoot>      <g
     256           id="flowPara6114" /></flowRoot>      <g
    253257         id="g6386"
    254258         transform="translate(-3.8146973e-6)">
    255         <rect
    256            style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.15369482;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    257            id="rect5954"
    258            width="26.947613"
    259            height="46.602379"
    260            x="36.174416"
    261            y="8.9821434" />
    262259        <flowRoot
    263260           transform="matrix(0.26458333,0,0,0.26458333,-33.04705,-17.589917)"
     
    273270               width="107.14282"
    274271               id="rect6084" /></flowRegion><flowPara
    275              style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial"
    276              id="flowPara6086">CC Sep 2018</flowPara><flowPara
    277272             style="font-size:24px;line-height:1"
    278              id="flowPara6088"><flowSpan
    279    id="flowSpan6118"
    280    style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">columnar index</flowSpan> </flowPara><flowPara
    281              id="flowPara6090" /></flowRoot>        <rect
    282            style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.15369482;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    283            id="rect5954-3"
    284            width="26.947613"
    285            height="46.602379"
    286            x="71.483315"
    287            y="8.7612762" />
    288         <flowRoot
    289            transform="matrix(0.26458333,0,0,0.26458333,2.9571308,-17.83785)"
    290            style="font-style:normal;font-weight:normal;font-size:40px;line-height:1;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;filter:url(#filter6144-7)"
    291            id="flowRoot6080-6"
    292            xml:space="preserve"><flowRegion
    293              style="line-height:1"
    294              id="flowRegion6082-9"><rect
    295                style="line-height:1"
    296                y="106.8054"
    297                x="262.85715"
    298                height="208.57144"
    299                width="107.14282"
    300                id="rect6084-7" /></flowRegion><flowPara
    301              style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial"
    302              id="flowPara6086-2">CC Oct 2018</flowPara><flowPara
    303              style="font-size:24px;line-height:1"
    304              id="flowPara6088-8"><flowSpan
    305    id="flowSpan6118-9"
    306    style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">columnar index</flowSpan> </flowPara><flowPara
    307              id="flowPara6090-2" /></flowRoot>        <flowRoot
    308            transform="matrix(0.26458333,0,0,0.26458333,20.323189,-24.166702)"
    309            style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
    310            id="flowRoot6219"
    311            xml:space="preserve"><flowRegion
    312              id="flowRegion6221"><rect
    313                y="185.37683"
    314                x="310"
    315                height="37.142857"
    316                width="54.285713"
    317                id="rect6223" /></flowRegion><flowPara
    318              id="flowPara6225">...</flowPara></flowRoot>        <flowRoot
    319            transform="matrix(0.26458333,0,0,0.26458333,48.314274,-18.592251)"
    320            style="font-style:normal;font-weight:normal;font-size:40px;line-height:1;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;filter:url(#filter6144-7-0)"
    321            id="flowRoot6080-6-3"
    322            xml:space="preserve"><flowRegion
    323              style="line-height:1"
    324              id="flowRegion6082-9-7"><rect
    325                style="line-height:1"
    326                y="106.8054"
    327                x="262.85715"
    328                height="208.57144"
    329                width="107.14282"
    330                id="rect6084-7-3" /></flowRegion><flowPara
    331              id="flowPara6281"
    332              style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">CC Aug 2019</flowPara><flowPara
    333              style="font-size:24px;line-height:1"
    334              id="flowPara6088-8-3"><flowSpan
    335    id="flowSpan6118-9-9"
    336    style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1;font-family:Arial;-inkscape-font-specification:Arial">columnar index</flowSpan> </flowPara><flowPara
    337              id="flowPara6090-2-8" /></flowRoot>        <rect
    338            style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.15369482;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    339            id="rect5954-3-4"
    340            width="26.947613"
    341            height="46.602379"
    342            x="116.46248"
    343            y="8.7612762" />
     273             id="flowPara6088" /><flowPara
     274             id="flowPara6090" /></flowRoot>        <g
     275           id="g13905">
     276          <rect
     277             y="8.9821434"
     278             x="36.174416"
     279             height="32.630665"
     280             width="26.947613"
     281             id="rect5954"
     282             style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     283          <flowRoot
     284             xml:space="preserve"
     285             id="flowRoot6219"
     286             style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     287             transform="matrix(0.26458333,0,0,0.26458333,20.323189,-24.166702)"><flowRegion
     288               id="flowRegion6221"><rect
     289                 id="rect6223"
     290                 width="54.285713"
     291                 height="37.142857"
     292                 x="310"
     293                 y="185.37683" /></flowRegion><flowPara
     294               id="flowPara6225">...</flowPara></flowRoot>          <flowRoot
     295             transform="matrix(0.26458333,0,0,0.26458333,1.8898848,4.5357143)"
     296             style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     297             id="flowRoot13798"
     298             xml:space="preserve"><flowRegion
     299               id="flowRegion13800"><rect
     300                 y="33.948257"
     301                 x="136.72218"
     302                 height="106.18565"
     303                 width="94.70639"
     304                 id="rect13802" /></flowRegion><flowPara
     305               id="flowPara13804">CC Sep 2018</flowPara><flowPara
     306               id="flowPara13806">Columnar Index</flowPara><flowPara
     307               id="flowPara13808" /></flowRoot>          <rect
     308             y="8.8441896"
     309             x="69.302986"
     310             height="32.630665"
     311             width="26.947613"
     312             id="rect5954-4"
     313             style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     314          <flowRoot
     315             transform="matrix(0.26458333,0,0,0.26458333,35.018452,4.3977563)"
     316             style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     317             id="flowRoot13798-0"
     318             xml:space="preserve"><flowRegion
     319               id="flowRegion13800-2"><rect
     320                 y="33.948257"
     321                 x="136.72218"
     322                 height="106.18565"
     323                 width="94.70639"
     324                 id="rect13802-0" /></flowRegion><flowPara
     325               id="flowPara13804-8">CC Oct 2018</flowPara><flowPara
     326               id="flowPara13806-1">Columnar Index</flowPara><flowPara
     327               id="flowPara13808-6" /></flowRoot>          <rect
     328             y="8.9197874"
     329             x="113.52618"
     330             height="32.630665"
     331             width="26.947613"
     332             id="rect5954-1"
     333             style="opacity:1;fill:#64e000;fill-opacity:0;stroke:#000000;stroke-width:0.12860805;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     334          <flowRoot
     335             transform="matrix(0.26458333,0,0,0.26458333,79.241667,4.4733578)"
     336             style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     337             id="flowRoot13798-09"
     338             xml:space="preserve"><flowRegion
     339               id="flowRegion13800-0"><rect
     340                 y="33.948257"
     341                 x="136.72218"
     342                 height="106.18565"
     343                 width="94.70639"
     344                 id="rect13802-07" /></flowRegion><flowPara
     345               id="flowPara13876">CC Aug 2019</flowPara><flowPara
     346               id="flowPara13806-6">Columnar Index</flowPara><flowPara
     347               id="flowPara13808-1" /></flowRoot>        </g>
    344348      </g>
    345349    </g>
     
    354358           x="154.28572"
    355359           y="568.23395" /></flowRegion><flowPara
    356          id="flowPara5976"></flowPara></flowRoot>    <flowRoot
     360         id="flowPara5976" /></flowRoot>    <flowRoot
    357361       xml:space="preserve"
    358362       id="flowRoot5980"
     
    364368           x="1.4285715"
    365369           y="65.376831" /></flowRegion><flowPara
    366          id="flowPara5986"></flowPara></flowRoot>    <flowRoot
     370         id="flowPara5986" /></flowRoot>    <flowRoot
    367371       xml:space="preserve"
    368372       id="flowRoot6469"
     
    374378           x="90.714287"
    375379           y="612.51971" /></flowRegion><flowPara
    376          id="flowPara6475"></flowPara></flowRoot>    <g
     380         id="flowPara6475" /></flowRoot>    <g
    377381       id="g8700">
    378       <rect
    379          y="81.553574"
    380          x="20.788691"
    381          height="24.190477"
    382          width="51.026787"
    383          id="rect6388"
    384          style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
    385       <rect
    386          y="76.261909"
    387          x="20.788691"
    388          height="5.2916689"
    389          width="15.497024"
    390          id="rect6390"
    391          style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
    392382      <flowRoot
    393383         transform="matrix(0.26458333,0,0,0.26458333,2.6458333,3.4017858)"
     
    401391             width="192.85715"
    402392             id="rect6396" /></flowRegion><flowPara
    403            id="flowPara6400">*.warc.wet files</flowPara><flowPara
    404            id="flowPara6404">CC Sep 2018</flowPara></flowRoot>      <g
    405          transform="translate(-32.883929,-20.197169)"
    406          id="g6579">
     393           id="flowPara6404" /></flowRoot>      <g
     394         id="g14156"
     395         transform="translate(0,-14.81667)">
    407396        <rect
    408            style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    409            id="rect6388-2"
     397           style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
     398           id="rect6388"
    410399           width="51.026787"
    411400           height="24.190477"
    412            x="99.974701"
    413            y="103.47619" />
     401           x="20.788691"
     402           y="81.553574" />
    414403        <rect
    415            style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    416            id="rect6390-7"
    417            width="15.497025"
    418            height="5.2916694"
    419            x="99.974701"
    420            y="98.184525" />
     404           style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
     405           id="rect6390"
     406           width="15.497024"
     407           height="5.2916689"
     408           x="20.788691"
     409           y="76.261909" />
     410        <g
     411           id="g6579"
     412           transform="translate(-32.883929,-20.197169)">
     413          <rect
     414             y="103.47619"
     415             x="99.974701"
     416             height="24.190477"
     417             width="51.026787"
     418             id="rect6388-2"
     419             style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     420          <rect
     421             y="98.184525"
     422             x="99.974701"
     423             height="5.2916694"
     424             width="15.497025"
     425             id="rect6390-7"
     426             style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     427        </g>
     428        <g
     429           id="g6630"
     430           transform="translate(61.241527,-46.849824)">
     431          <rect
     432             y="131.19792"
     433             x="51.460232"
     434             height="24.190477"
     435             width="51.026787"
     436             id="rect6388-8-4"
     437             style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" />
     438          <rect
     439             y="125.90625"
     440             x="51.460232"
     441             height="5.2916694"
     442             width="15.497025"
     443             id="rect6390-9-4"
     444             style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1" />
     445        </g>
     446        <g
     447           id="g6591"
     448           transform="translate(21.43299,-58.018398)">
     449          <rect
     450             y="143.1637"
     451             x="111.31399"
     452             height="24.190477"
     453             width="51.026787"
     454             id="rect6388-8"
     455             style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     456          <rect
     457             y="137.87202"
     458             x="111.31399"
     459             height="5.2916694"
     460             width="15.497025"
     461             id="rect6390-9"
     462             style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
     463        </g>
    421464        <flowRoot
    422            xml:space="preserve"
    423            id="flowRoot6392-0"
    424            style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
    425            transform="matrix(0.26458333,0,0,0.26458333,81.831845,25.3244)"><flowRegion
    426              id="flowRegion6394-8"><rect
    427                id="rect6396-9"
    428                width="192.85715"
    429                height="91.428543"
     465           transform="matrix(0.26458333,0,0,0.26458333,4.9136906,4.8380953)"
     466           style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     467           id="flowRoot14053"
     468           xml:space="preserve"><flowRegion
     469             id="flowRegion14055"><rect
     470               y="308.23401"
    430471               x="78.571426"
    431                y="308.23401" /></flowRegion><flowPara
    432              id="flowPara6400-8">*.warc.wet files</flowPara><flowPara
    433              id="flowPara6404-1">CC Oct 2018</flowPara></flowRoot>      </g>
    434       <g
    435          transform="translate(61.241527,-46.849824)"
    436          id="g6630">
    437         <rect
    438            style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1"
    439            id="rect6388-8-4"
    440            width="51.026787"
    441            height="24.190477"
    442            x="51.460232"
    443            y="131.19792" />
    444         <rect
    445            style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:1.06, 1.06;stroke-dashoffset:0;stroke-opacity:1"
    446            id="rect6390-9-4"
    447            width="15.497025"
    448            height="5.2916694"
    449            x="51.460232"
    450            y="125.90625" />
    451       </g>
    452       <g
    453          transform="translate(21.43299,-58.018398)"
    454          id="g6591">
    455         <rect
    456            style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    457            id="rect6388-8"
    458            width="51.026787"
    459            height="24.190477"
    460            x="111.31399"
    461            y="143.1637" />
    462         <rect
    463            style="opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26499999;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
    464            id="rect6390-9"
    465            width="15.497025"
    466            height="5.2916694"
    467            x="111.31399"
    468            y="137.87202" />
    469         <text
    470            xml:space="preserve"
    471            style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.3499999px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
    472            x="115.66071"
    473            y="151.85715"
    474            id="text6479"><tspan
    475              sodipodi:role="line"
    476              id="tspan6477"
    477              x="115.66071"
    478              y="151.85715"
    479              style="stroke-width:0.26458332">*.warc.wet files</tspan><tspan
    480              sodipodi:role="line"
    481              x="115.66071"
    482              y="159.79465"
    483              style="stroke-width:0.26458332"
    484              id="tspan6481">CC Aug 2019</tspan></text>
    485       </g>
     472               height="52.235638"
     473               width="142.14285"
     474               id="rect14057" /></flowRegion><flowPara
     475             id="flowPara14059">*.warc.wet files</flowPara><flowPara
     476             id="flowPara14061">CC Sep 2018</flowPara></flowRoot>        <flowRoot
     477           transform="matrix(0.26458333,0,0,0.26458333,50.006029,7.3307912)"
     478           style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     479           id="flowRoot14053-6"
     480           xml:space="preserve"><flowRegion
     481             id="flowRegion14055-5"><rect
     482               y="308.23401"
     483               x="78.571426"
     484               height="52.235638"
     485               width="142.14285"
     486               id="rect14057-1" /></flowRegion><flowPara
     487             id="flowPara14059-3">*.warc.wet files</flowPara><flowPara
     488             id="flowPara14061-0">CC Oct 2018</flowPara></flowRoot>        <flowRoot
     489           transform="matrix(0.26458333,0,0,0.26458333,114.78021,8.3683492)"
     490           style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     491           id="flowRoot14053-6-6"
     492           xml:space="preserve"><flowRegion
     493             id="flowRegion14055-5-5"><rect
     494               y="308.23401"
     495               x="78.571426"
     496               height="52.235638"
     497               width="142.14285"
     498               id="rect14057-1-8" /></flowRegion><flowPara
     499             id="flowPara14059-3-3">*.warc.wet files</flowPara><flowPara
     500             id="flowPara14061-0-1">CC Aug 2019</flowPara></flowRoot>      </g>
    486501    </g>
    487502    <path
    488503       style="fill:none;stroke:#000000;stroke-width:0.64353597px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart)"
    489        d="M 91.563371,76.232879 C 91.031116,56.190155 92.909569,56.190155 92.909569,56.190155"
     504       d="M 91.563371,63.532871 C 91.031116,43.490147 92.909569,43.490147 92.909569,43.490147"
    490505       id="path6644"
    491506       inkscape:connector-curvature="0" />
     
    500515         x="169.53333"
    501516         y="66.803123"
    502          style="stroke-width:0.26458332"></tspan><tspan
     517         style="stroke-width:0.26458332" /><tspan
    503518         sodipodi:role="line"
    504519         x="169.53333"
     
    526541           x="372.02237"
    527542           y="209.24846" /></flowRegion><flowPara
    528          id="flowPara8534"></flowPara></flowRoot>    <flowRoot
     543         id="flowPara8534" /></flowRoot>    <flowRoot
    529544       xml:space="preserve"
    530545       id="flowRoot8538"
     
    541556       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
    542557       x="126.03919"
    543        y="63.406425"
     558       y="49.118912"
    544559       id="text8548"><tspan
    545560         sodipodi:role="line"
    546561         x="126.03919"
    547          y="63.406425"
     562         y="49.118912"
    548563         style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332"
    549564         id="tspan8550">content_languages = 'mri'</tspan><tspan
    550565         sodipodi:role="line"
    551566         x="126.03919"
    552          y="68.345314"
     567         y="54.0578"
    553568         style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332"
    554569         id="tspan8558">+</tspan><tspan
    555570         sodipodi:role="line"
    556571         x="126.03919"
    557          y="73.284203"
     572         y="58.996689"
    558573         style="line-height:1;text-align:center;text-anchor:middle;stroke-width:0.26458332"
    559574         id="tspan8560">warc to wet</tspan></text>
    560575    <flowRoot
    561        transform="matrix(0.26458333,0,0,0.26458333,38.922917,130.28623)"
     576       transform="matrix(0.26458333,0,0,0.26458333,43.156253,110.17777)"
    562577       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
    563578       id="flowRoot6219-8"
     
    571586         id="flowPara6225-5">...</flowPara></flowRoot>    <g
    572587       id="g8667"
    573        transform="translate(-8.3391904,40.839482)">
     588       transform="translate(-4.1058568,21.78947)">
    574589      <rect
    575590         y="124.0766"
     
    601616    <g
    602617       id="g8709"
    603        transform="translate(4.7141497,49.968462)">
     618       transform="translate(8.9474833,31.976784)">
    604619      <g
    605620         id="g8744">
     
    634649    <g
    635650       id="g8658"
    636        transform="translate(-14.085366,17.548923)">
     651       transform="translate(-9.8520303,-1.5010822)">
    637652      <rect
    638653         y="153.60989"
     
    662677           id="flowPara8572-7">site 00002</flowPara><flowPara
    663678           id="flowPara8574-4">seedURLs + url filters</flowPara></flowRoot>    </g>
    664     <g
    665        id="g8797"
    666        transform="translate(8.0502882,-6.9746679)">
    667       <path
    668          inkscape:connector-curvature="0"
    669          id="path8783"
    670          d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
    671          style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    672       <path
    673          inkscape:connector-curvature="0"
    674          id="path8785"
    675          d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
    676          style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    677       <path
    678          inkscape:connector-curvature="0"
    679          id="path8787"
    680          d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
    681          style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    682       <path
    683          inkscape:connector-curvature="0"
    684          id="path8791"
    685          d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
    686          style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    687     </g>
    688     <g
    689        id="g8854"
    690        transform="translate(89.92074,-0.52379428)">
    691       <path
    692          inkscape:connector-curvature="0"
    693          id="path8783-9"
    694          d="m 45.884371,117.35848 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
    695          style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    696       <path
    697          inkscape:connector-curvature="0"
    698          id="path8785-2"
    699          d="m 48.022521,122.43661 c 17.10525,0 17.10525,0 17.10525,0"
    700          style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    701       <path
    702          inkscape:connector-curvature="0"
    703          id="path8787-3"
    704          d="m 46.418911,128.5838 c 17.10525,0 17.10525,0 17.10525,0"
    705          style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    706       <path
    707          inkscape:connector-curvature="0"
    708          id="path8791-1"
    709          d="m 45.617101,134.19646 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
    710          style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
    711     </g>
    712     <g
    713        id="g8797-3"
    714        transform="translate(46.537087,0.53576233)"
    715        style="fill:#ffffff;fill-opacity:0.98412697">
    716       <g
    717          id="g8844"
    718          transform="translate(0,2.6458334)"
    719          style="fill:#ffffff;fill-opacity:0.98412697">
    720         <path
    721            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    722            d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
    723            id="path8783-7"
    724            inkscape:connector-curvature="0" />
    725         <path
    726            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    727            d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
    728            id="path8785-5"
    729            inkscape:connector-curvature="0" />
    730         <path
    731            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    732            d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
    733            id="path8787-8"
    734            inkscape:connector-curvature="0" />
    735         <path
    736            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    737            d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
    738            id="path8791-4"
    739            inkscape:connector-curvature="0" />
    740       </g>
    741     </g>
    742679    <path
    743680       style="fill:none;stroke:#000000;stroke-width:0.62406325px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2)"
    744        d="M 81.998023,162.37652 C 81.457172,143.82815 83.36598,143.82815 83.36598,143.82815"
     681       d="M 81.998023,141.73889 C 81.457172,123.19052 83.36598,123.19052 83.36598,123.19052"
    745682       id="path6644-5"
    746683       inkscape:connector-curvature="0" />
    747684    <path
    748685       style="fill:none;stroke:#000000;stroke-width:0.64353597px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8)"
    749        d="m 105.88728,142.56554 c 19.8306,2.95621 19.50441,4.80612 19.50441,4.80612"
     686       d="M 94.763655,112.71836 C 111.855,102.23609 112.79422,103.86287 112.79422,103.86287"
    750687       id="path6644-5-4"
    751688       inkscape:connector-curvature="0" />
    752689    <path
    753690       style="fill:none;stroke:#000000;stroke-width:0.59350747px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9)"
    754        d="m 81.814159,127.36144 c -0.53225,-17.0476 1.3462,-17.0476 1.3462,-17.0476"
     691       d="m 81.814159,111.48634 c -0.53225,-17.047567 1.3462,-17.047567 1.3462,-17.047567"
    755692       id="path6644-5-5"
    756693       inkscape:connector-curvature="0" />
     
    758695       xml:space="preserve"
    759696       id="flowRoot10549"
    760        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
    761        transform="matrix(0.26458333,0,0,0.26458333,0,2.1166667)"><flowRegion
    762          id="flowRegion10551"><rect
     697       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
     698       transform="translate(-4.5959653,-47.786576)"><flowRegion
     699         id="flowRegion10551"
     700         style="stroke-width:0.26458332"><rect
    763701           id="rect10553"
    764            width="291.42856"
    765            height="55.714287"
    766            x="450"
    767            y="555.37683" /></flowRegion><flowPara
    768          id="flowPara10555">blacklist + greylist + whitelist +</flowPara><flowPara
    769          id="flowPara10557">sites needing custom handling</flowPara></flowRoot>    <g
    770        id="g8797-3-1"
    771        transform="translate(61.771447,5.8811623)"
    772        style="fill:#ffffff;fill-opacity:0.98412697">
     702           width="77.10714"
     703           height="14.741072"
     704           x="119.0625"
     705           y="146.94345"
     706           style="stroke-width:0.07000434" /></flowRegion><flowPara
     707         id="flowPara10555"
     708         style="stroke-width:0.26458332">blacklist + greylist + whitelist +</flowPara><flowPara
     709         id="flowPara10557"
     710         style="stroke-width:0.26458332">sites needing custom handling</flowPara></flowRoot>    <g
     711       id="g14180"
     712       transform="translate(-1.6508314,16.447997)">
    773713      <g
    774          id="g8844-6"
    775          transform="translate(0,2.6458334)"
    776          style="fill:#ffffff;fill-opacity:0.98412697">
    777         <path
    778            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     714         transform="translate(8.0502882,-22.849676)"
     715         id="g8797">
     716        <path
     717           style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    779718           d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
    780            id="path8783-7-3"
    781            inkscape:connector-curvature="0" />
    782         <path
    783            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     719           id="path8783"
     720           inkscape:connector-curvature="0" />
     721        <path
     722           style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    784723           d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
    785            id="path8785-5-7"
    786            inkscape:connector-curvature="0" />
    787         <path
    788            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     724           id="path8785"
     725           inkscape:connector-curvature="0" />
     726        <path
     727           style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    789728           d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
    790            id="path8787-8-3"
    791            inkscape:connector-curvature="0" />
    792         <path
    793            style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     729           id="path8787"
     730           inkscape:connector-curvature="0" />
     731        <path
     732           style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
    794733           d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
    795            id="path8791-4-2"
    796            inkscape:connector-curvature="0" />
     734           id="path8791"
     735           inkscape:connector-curvature="0" />
     736      </g>
     737      <g
     738         transform="translate(89.92074,-16.398799)"
     739         id="g8854">
     740        <path
     741           style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     742           d="m 45.884371,117.35848 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
     743           id="path8783-9"
     744           inkscape:connector-curvature="0" />
     745        <path
     746           style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     747           d="m 48.022521,122.43661 c 17.10525,0 17.10525,0 17.10525,0"
     748           id="path8785-2"
     749           inkscape:connector-curvature="0" />
     750        <path
     751           style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     752           d="m 46.418911,128.5838 c 17.10525,0 17.10525,0 17.10525,0"
     753           id="path8787-3"
     754           inkscape:connector-curvature="0" />
     755        <path
     756           style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     757           d="m 45.617101,134.19646 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
     758           id="path8791-1"
     759           inkscape:connector-curvature="0" />
     760      </g>
     761      <g
     762         style="fill:#ffffff;fill-opacity:0.98412697"
     763         transform="translate(46.537087,-15.339242)"
     764         id="g8797-3">
     765        <g
     766           style="fill:#ffffff;fill-opacity:0.98412697"
     767           transform="translate(0,2.6458334)"
     768           id="g8844">
     769          <path
     770             inkscape:connector-curvature="0"
     771             id="path8783-7"
     772             d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
     773             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     774          <path
     775             inkscape:connector-curvature="0"
     776             id="path8785-5"
     777             d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
     778             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     779          <path
     780             inkscape:connector-curvature="0"
     781             id="path8787-8"
     782             d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
     783             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     784          <path
     785             inkscape:connector-curvature="0"
     786             id="path8791-4"
     787             d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
     788             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     789        </g>
     790      </g>
     791      <g
     792         style="fill:#ffffff;fill-opacity:0.98412697"
     793         transform="translate(61.771447,-9.9938385)"
     794         id="g8797-3-1">
     795        <g
     796           style="fill:#ffffff;fill-opacity:0.98412697"
     797           transform="translate(0,2.6458334)"
     798           id="g8844-6">
     799          <path
     800             inkscape:connector-curvature="0"
     801             id="path8783-7-3"
     802             d="m 108.51143,118.73122 -6.68174,21.1143 h 25.65787 l 5.87993,-21.1143 z"
     803             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     804          <path
     805             inkscape:connector-curvature="0"
     806             id="path8785-5-7"
     807             d="m 110.64958,123.80935 c 17.10525,0 17.10525,0 17.10525,0"
     808             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     809          <path
     810             inkscape:connector-curvature="0"
     811             id="path8787-8-3"
     812             d="m 109.04597,129.95654 c 17.10525,0 17.10525,0 17.10525,0"
     813             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     814          <path
     815             inkscape:connector-curvature="0"
     816             id="path8791-4-2"
     817             d="m 108.24416,135.5692 c 16.03617,-0.26726 16.03617,-0.26726 16.03617,-0.26726"
     818             style="fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
     819        </g>
    797820      </g>
    798821    </g>
     
    801824       id="flowRoot10757"
    802825       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none"
    803        transform="matrix(0.26458333,0,0,0.26458333,5.1165915,-0.90713918)"><flowRegion
     826       transform="matrix(0.26458333,0,0,0.26458333,5.1165915,-20.486313)"><flowRegion
    804827         id="flowRegion10759"
    805828         style="text-align:center;text-anchor:middle"><rect
     
    810833           y="509.66254"
    811834           style="text-align:center;text-anchor:middle" /></flowRegion><flowPara
    812          id="flowPara10763">CCWETProcessor.java</flowPara></flowRoot>    <path
    813        style="fill:none;stroke:#000000;stroke-width:0.58551782px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2)"
    814        d="m 124.20743,213.25747 c -0.49158,-17.96448 1.24333,-17.96448 1.24333,-17.96448"
    815        id="path6644-5-5-7"
    816        inkscape:connector-curvature="0" />
    817     <flowRoot
     835         id="flowPara10763">CCWETProcessor.java</flowPara></flowRoot>    <flowRoot
    818836       xml:space="preserve"
    819837       id="flowRoot11618"
     
    825843           x="255.71428"
    826844           y="806.80542" /></flowRegion><flowPara
    827          id="flowPara11624"></flowPara></flowRoot>    <flowRoot
     845         id="flowPara11624" /></flowRoot>    <flowRoot
    828846       xml:space="preserve"
    829847       id="flowRoot11626"
     
    835853           x="364.28571"
    836854           y="835.37683" /></flowRegion><flowPara
    837          id="flowPara11632"></flowPara></flowRoot>    <flowRoot
     855         id="flowPara11632" /></flowRoot>    <flowRoot
    838856       xml:space="preserve"
    839857       id="flowRoot11636"
    840858       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:24px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
    841        transform="matrix(0.26458333,0,0,0.26458333,41.665157,-19.117346)"><flowRegion
     859       transform="matrix(0.26458333,0,0,0.26458333,41.665157,-34.992356)"><flowRegion
    842860         id="flowRegion11638"><rect
    843861           id="rect11640"
     
    846864           x="340"
    847865           y="826.80542" /></flowRegion><flowPara
    848          id="flowPara11642">Crawl with Apache Nutch</flowPara><flowPara
     866         id="flowPara11642" /><flowPara
    849867         id="flowPara11644" /></flowRoot>    <rect
    850868       style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:1.00157475;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
     
    867885           y="875.03229"
    868886           style="fill:#000000;fill-opacity:0;" /></flowRegion><flowPara
    869          id="flowPara12554"></flowPara></flowRoot>    <flowRoot
     887         id="flowPara12554" /></flowRoot>    <flowRoot
    870888       xml:space="preserve"
    871889       id="flowRoot12556"
     
    877895           x="431.84021"
    878896           y="871.49677" /></flowRegion><flowPara
    879          id="flowPara12562"></flowPara></flowRoot>    <g
     897         id="flowPara12562" /></flowRoot>    <g
    880898       id="g12584"
    881        transform="translate(6.9121608,-2.1711027)">
     899       transform="translate(35.409039,-23.306338)">
    882900      <g
    883901         transform="translate(59.46747,-6.5481034)"
     
    921939           id="flowPara12572" /></flowRoot>    </g>
    922940    <path
    923        style="fill:none;stroke:#000000;stroke-width:0.64151984px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8)"
    924        d="m 96.775998,235.2001 c 20.118612,-0.52693 20.118602,1.33273 20.118602,1.33273"
     941       style="fill:none;stroke:#000000;stroke-width:0.61500657px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2)"
     942       d="m 161.40032,200.57914 c -0.49158,-19.81955 1.24333,-19.81955 1.24333,-19.81955"
     943       id="path6644-5-5-7"
     944       inkscape:connector-curvature="0" />
     945    <path
     946       style="fill:none;stroke:#000000;stroke-width:0.78573805px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8)"
     947       d="m 110.10792,211.95692 c 30.18095,-0.52693 30.18093,1.33273 30.18093,1.33273"
    925948       id="path6644-5-4-0"
    926949       inkscape:connector-curvature="0" />
    927950    <flowRoot
    928        xml:space="preserve"
    929        id="flowRoot12967"
    930        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none"
    931        transform="matrix(0.26458333,0,0,0.26458333,-57.452381,56.318459)"><flowRegion
    932          id="flowRegion12969"
    933          style="text-align:center;text-anchor:middle"><rect
    934            id="rect12971"
    935            width="252.59795"
    936            height="45.616917"
    937            x="297.14285"
    938            y="771.09113"
    939            style="text-align:center;text-anchor:middle" /></flowRegion><flowPara
    940          id="flowPara12973">text dump per crawled site</flowPara></flowRoot>    <flowRoot
    941951       xml:space="preserve"
    942952       id="flowRoot13119"
     
    948958           x="79.814285"
    949959           y="893.01202" /></flowRegion><flowPara
    950          id="flowPara13125"></flowPara></flowRoot>    <g
     960         id="flowPara13125" /></flowRoot>    <g
    951961       id="g13297"
    952        transform="translate(0,1.0583333)">
     962       transform="translate(9.4494046,-25.713581)">
    953963      <g
    954964         transform="translate(-80.712161,97.488904)"
     
    10771087         style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
    10781088    </g>
    1079   </g>
     1089    <g
     1090       id="g12584-4"
     1091       transform="translate(-83.111041,35.25929)">
     1092      <g
     1093         transform="translate(59.46747,-6.5481034)"
     1094         id="g12546-1">
     1095        <ellipse
     1096           style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
     1097           id="path12440-2"
     1098           cx="66.523811"
     1099           cy="232.36606"
     1100           rx="13.985119"
     1101           ry="4.5357141" />
     1102        <path
     1103           style="opacity:1;fill:#ffffff;fill-opacity:0.98412697;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
     1104           d="m 80.473281,254.62108 c 1e-5,2.50501 -6.261304,4.53572 -13.985039,4.53572 -7.723738,0 -13.985059,-2.03071 -13.985049,-4.53572"
     1105           id="path12532-6"
     1106           inkscape:connector-curvature="0"
     1107           sodipodi:nodetypes="csc" />
     1108        <path
     1109           style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     1110           d="m 52.538693,232.36606 -0.0355,22.25502"
     1111           id="path12538-3"
     1112           inkscape:connector-curvature="0" />
     1113        <path
     1114           style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
     1115           d="m 80.508929,232.36606 -0.03565,22.25502"
     1116           id="path12540-5"
     1117           inkscape:connector-curvature="0" />
     1118      </g>
     1119      <flowRoot
     1120         transform="matrix(0.26458333,0,0,0.26458333,1.3363477,3.2072344)"
     1121         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     1122         id="flowRoot12564-1"
     1123         xml:space="preserve"><flowRegion
     1124           id="flowRegion12566-0"><rect
     1125             y="876.54755"
     1126             x="431.33514"
     1127             height="57.07362"
     1128             width="86.873116"
     1129             id="rect12568-8" /></flowRegion><flowPara
     1130           id="flowPara12570-9">MongoDB</flowPara><flowPara
     1131           id="flowPara12572-4" /></flowRoot>    </g>
     1132    <text
     1133       xml:space="preserve"
     1134       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888855px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
     1135       x="128.51186"
     1136       y="186.63091"
     1137       id="text14721"><tspan
     1138         sodipodi:role="line"
     1139         id="tspan14719"
     1140         x="128.51186"
     1141         y="186.63091"
     1142         style="stroke-width:0.26458332">Crawl with </tspan><tspan
     1143         sodipodi:role="line"
     1144         x="128.51186"
     1145         y="192.80452"
     1146         style="stroke-width:0.26458332"
     1147         id="tspan15630">Apache Nutch</tspan></text>
     1148    <path
     1149       style="fill:none;stroke:#000000;stroke-width:0.62741137px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-9-2-8)"
     1150       d="m 41.504639,258.88526 c -0.418819,-24.21071 1.059298,-24.21071 1.059298,-24.21071"
     1151       id="path6644-5-5-7-9"
     1152       inkscape:connector-curvature="0" />
     1153    <flowRoot
     1154       xml:space="preserve"
     1155       id="flowRoot15304"
     1156       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     1157       transform="matrix(0.26458333,0,0,0.26458333,-9.0714287,-1.889881)"><flowRegion
     1158         id="flowRegion15306"><rect
     1159           id="rect15308"
     1160           width="147.14287"
     1161           height="54.285732"
     1162           x="444.28571"
     1163           y="833.94824" /></flowRegion><flowPara
     1164         id="flowPara15310">get text dump of</flowPara><flowPara
     1165         id="flowPara15314">each crawled site</flowPara></flowRoot>    <flowRoot
     1166       xml:space="preserve"
     1167       id="flowRoot10757-0"
     1168       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
     1169       transform="matrix(0.26458333,0,0,0.26458333,9.8528943,103.04083)"><flowRegion
     1170         id="flowRegion10759-1"
     1171         style="text-align:start;text-anchor:start"><rect
     1172           id="rect10761-2"
     1173           width="360.54208"
     1174           height="74.783302"
     1175           x="145.71428"
     1176           y="509.66254"
     1177           style="text-align:start;text-anchor:start" /></flowRegion><flowPara
     1178         id="flowPara10763-6">NutchTextDumpToMongoDB.java</flowPara><flowPara
     1179         id="flowPara15752">- compute + store site and page level meta</flowPara><flowPara
     1180         id="flowPara15748">- store full text per web page</flowPara></flowRoot>    <flowRoot
     1181       xml:space="preserve"
     1182       id="flowRoot15734"
     1183       style="fill:black;fill-opacity:1;stroke:none;font-family:Arial;font-style:normal;font-weight:normal;font-size:18.66666667px;line-height:1.25;letter-spacing:0px;word-spacing:0px;-inkscape-font-specification:'Arial, Normal';font-stretch:normal;font-variant:normal;text-anchor:start;text-align:start;writing-mode:lr;font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"><flowRegion
     1184         id="flowRegion15736"><rect
     1185           id="rect15738"
     1186           width="184.28572"
     1187           height="52.312145"
     1188           x="345.71429"
     1189           y="944.49323" /></flowRegion><flowPara
     1190         id="flowPara15740" /></flowRoot>    <ellipse
     1191       style="opacity:1;fill:#000000;fill-opacity:0;stroke:#000000;stroke-width:0.26499999;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
     1192       id="path15809"
     1193       cx="144.57588"
     1194       cy="271.4866"
     1195       rx="41.766369"
     1196       ry="14.930058" />
     1197    <path
     1198       style="fill:none;stroke:#000000;stroke-width:0.78573805px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1;marker-start:url(#Arrow2Lstart-2-8-8-6)"
     1199       d="m 91.922716,272.44387 c -30.180951,-0.52693 -30.18093,1.33273 -30.18093,1.33273"
     1200       id="path6644-5-4-0-2"
     1201       inkscape:connector-curvature="0" />
     1202    <flowRoot
     1203       xml:space="preserve"
     1204       id="flowRoot16118"
     1205       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:18.66666603px;line-height:1.25;font-family:Arial;-inkscape-font-specification:'Arial, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none"
     1206       transform="scale(0.26458333)"><flowRegion
     1207         id="flowRegion16120"
     1208         style="text-align:center;text-anchor:middle"><rect
     1209           id="rect16122"
     1210           width="222.85715"
     1211           height="55.714287"
     1212           x="434.28571"
     1213           y="996.80542"
     1214           style="text-align:center;text-anchor:middle" /></flowRegion><flowPara
     1215         id="flowPara16124">Filter</flowPara><flowPara
     1216         id="flowPara16126">with MongoDB queries</flowPara></flowRoot>  </g>
    10801217</svg>
Note: See TracChangeset for help on using the changeset viewer.