source: gs3-extensions/maori-lang-detection/hdfs-cc-work/conf/nutch-site.xml@33596

Last change on this file since 33596 was 33596, checked in by ak19, 5 years ago

Adding in the nutch-site.xml and regex-urlfilter.GS_TEMPLATE template file that need to go into apache-nutch-2.3.1/nutch when setting this up for crawls

File size: 5.0 KB
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <property>
    <name>http.agent.name</name>
    <value>GreenstoneTeam</value>
  </property>
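  <!-- Nutch refuses to fetch if http.agent.name is empty; the value above is sent to crawled
  sites as part of the User-Agent header. A hedged sketch of the companion http.agent.*
  properties that could round out the crawler's identity (the values here are placeholders,
  not settings from this setup):
  <property>
    <name>http.agent.description</name>
    <value>Greenstone language-detection crawl</value>
  </property>
  <property>
    <name>http.agent.url</name>
    <value>http://example.org/crawler-info</value>
  </property>
  <property>
    <name>http.agent.email</name>
    <value>crawler-admin@example.org</value>
  </property>
  -->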

  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.hbase.store.HBaseStore</value>
    <description>Default class for storing data</description>
  </property>
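  <!-- In a standard Nutch 2.x HBase setup this is usually paired with enabling the gora-hbase
  dependency in ivy/ivy.xml before building, and pointing Gora at the same store in
  apache-nutch-2.3.1/conf/gora.properties. A minimal sketch (an assumption about the setup,
  not one of the files checked in here):

    gora.datastore.default=org.apache.gora.hbase.store.HBaseStore

  HBase connection details (e.g. the ZooKeeper quorum) are then read from an hbase-site.xml
  on the classpath.
  -->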
  <!-- GS NOTE:
  Refer to https://lucene.472066.n3.nabble.com/protocol-http-or-protocol-httpclient-td4262473.html
  Plugin protocol-http now supports https, ssl too. Whatever problems may still exist with
  protocol-httpclient (recommended below for https, back when it had more problems than at
  present) also exist in protocol-http.
  In short, there's no need to turn on protocol-httpclient for https, as protocol-http works
  (equally well) for https pages too.
  -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names to
    include. Any plugin not matching this expression is excluded.
    In any case you need to at least include the nutch-extensionpoints plugin. By
    default Nutch includes crawling just HTML and plain text via HTTP,
    and basic indexing and search plugins. In order to use HTTPS please enable
    protocol-httpclient, but be aware of possible intermittent problems with the
    underlying commons-httpclient library. Set parsefilter-naivebayes for a
    classification-based focused crawler.
    </description>
  </property>
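  <!-- plugin.includes is a regular expression over plugin directory names. Expanded, the
  value above enables: protocol-http, urlfilter-regex, urlfilter-validator, parse-html,
  parse-tika, index-basic, index-anchor, indexer-solr, scoring-opic, urlnormalizer-pass,
  urlnormalizer-regex and urlnormalizer-basic. Any plugin directory not matching the
  expression stays disabled.
  -->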


  <!-- https://lucene.472066.n3.nabble.com/Content-of-size-X-was-truncated-to-Y-td4003517.html -->
  <!--
  <property>
    <name>parser.skip.truncated</name>
    <value>false</value>
    <description>Boolean value for whether we should skip parsing of truncated documents. By default this
    property is activated because parsing truncated documents can consume extremely high levels of CPU.
    </description>
  </property>
  -->

  <property>
    <name>http.content.limit</name>
    <value>-1</value>
    <description>The length limit for downloaded content using the http://
    protocol, in bytes. If this value is nonnegative (>=0), content longer
    than it will be truncated; otherwise, no truncation at all. Do not
    confuse this setting with the file.content.limit setting.
    </description>
  </property>
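  <!-- With -1 above, downloaded pages are never truncated, which avoids the "Content of size
  X was truncated to Y" parse failures discussed in the link above. If a finite ceiling is
  preferred over no limit at all, a larger value than the Nutch default of 65536 bytes can be
  set instead; a sketch (the 64 MB figure is only an example, not a recommendation from this setup):
  <property>
    <name>http.content.limit</name>
    <value>67108864</value>  (64 * 1024 * 1024 bytes = 64 MB)
  </property>
  -->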

  <!-- https://stackoverflow.com/questions/4871972/how-to-speed-up-crawling-in-nutch -->
  <!--
  <property>
    <name>fetcher.threads.per.queue</name>
    <value>1</value>
    <description>This number is the maximum number of threads that
    should be allowed to access a queue at one time. Setting it to
    a value > 1 will cause the Crawl-Delay value from robots.txt to
    be ignored and the value of fetcher.server.min.delay to be used
    as a delay between successive requests to the same server instead
    of fetcher.server.delay.
    </description>
  </property>
  -->
  <property>
    <name>generate.max.count</name>
    <value>50</value>
    <description>The maximum number of urls in a single
    fetchlist. -1 if unlimited. The urls are counted according
    to the value of the parameter generate.count.mode.
    </description>
  </property>
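  <!-- Worked example: assuming the default count mode of "host", generate.max.count=50 means
  the generator puts at most 50 URLs from any single host into one fetchlist, no matter how
  many candidates that host has waiting in the crawl database.
  -->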
  <property>
    <name>fetcher.throughput.threshold.pages</name>
    <value>1</value>
    <description>The threshold of minimum pages per second. If the fetcher downloads fewer
    pages per second than the configured threshold, the fetcher stops, preventing slow queues
    from stalling the throughput. This threshold must be an integer. This can be useful when
    fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
    </description>
  </property>
  <property>
    <name>fetcher.server.delay</name>
    <value>0.5</value>
    <description>The number of seconds the fetcher will delay between
    successive requests to the same server. Note that this might get
    overridden by a Crawl-Delay from a robots.txt and is used ONLY if
    fetcher.threads.per.queue is set to 1.
    </description>
  </property>
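  <!-- Back-of-the-envelope pacing: with fetcher.server.delay at 0.5 and fetcher.threads.per.queue
  left at its default of 1 (see the commented-out property above), the fetcher waits half a
  second between requests to the same server, i.e. at most roughly 2 pages per second per host,
  ignoring download time and any larger Crawl-Delay a site declares in robots.txt.
  -->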


</configuration>