1 | <?xml version="1.0"?>
|
---|
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
---|
3 | <!--
|
---|
4 | Licensed to the Apache Software Foundation (ASF) under one or more
|
---|
5 | contributor license agreements. See the NOTICE file distributed with
|
---|
6 | this work for additional information regarding copyright ownership.
|
---|
7 | The ASF licenses this file to You under the Apache License, Version 2.0
|
---|
8 | (the "License"); you may not use this file except in compliance with
|
---|
9 | the License. You may obtain a copy of the License at
|
---|
10 |
|
---|
11 | http://www.apache.org/licenses/LICENSE-2.0
|
---|
12 |
|
---|
13 | Unless required by applicable law or agreed to in writing, software
|
---|
14 | distributed under the License is distributed on an "AS IS" BASIS,
|
---|
15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
16 | See the License for the specific language governing permissions and
|
---|
17 | limitations under the License.
|
---|
18 | -->
|
---|
19 |
|
---|
20 | <!-- Put site-specific property overrides in this file. -->
|
---|
21 |
|
---|
22 | <configuration>
|
---|
23 |
|
---|
24 | <property>
|
---|
25 | <name>http.agent.name</name>
|
---|
26 | <value>GreenstoneTeam</value>
|
---|
27 | </property>
|
---|
28 |
|
---|
29 | <property>
|
---|
30 | <name>storage.data.store.class</name>
|
---|
31 | <value>org.apache.gora.hbase.store.HBaseStore</value>
|
---|
32 | <description>Default class for storing data</description>
|
---|
33 | </property>
|
---|
34 | <!-- GS NOTE:
|
---|
35 | Refer to https://lucene.472066.n3.nabble.com/protocol-http-or-protocol-httpclient-td4262473.html
|
---|
36 | Plugin protocol-http now supports https, ssl too. Whatever problems may still exist with protocol-httpclient
|
---|
37 | (recommended below for https, back when it had more problems than at present) also exist in protocol-http.
|
---|
38 | In short, there's no need to turn on protocol-httpclient for https, as protocol-http works (equally well) for
|
---|
39 | https pages too.
|
---|
40 | -->
|
---|
41 | <property>
|
---|
42 | <name>plugin.includes</name>
|
---|
43 | <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
|
---|
45 | <description>Regular expression naming plugin directory names to
|
---|
46 | include. Any plugin not matching this expression is excluded.
|
---|
47 | In any case you need to include at least the nutch-extensionpoints plugin. By
|
---|
48 | default Nutch includes crawling just HTML and plain text via HTTP,
|
---|
49 | and basic indexing and search plugins. In order to use HTTPS please enable
|
---|
50 | protocol-httpclient, but be aware of possible intermittent problems with the
|
---|
51 | underlying commons-httpclient library. Set parsefilter-naivebayes for classification based focused crawler.
|
---|
52 | </description>
|
---|
53 | </property>
|
---|
54 |
|
---|
55 |
|
---|
56 | <!-- https://lucene.472066.n3.nabble.com/Content-of-size-X-was-truncated-to-Y-td4003517.html -->
|
---|
57 | <!--
|
---|
58 | <property>
|
---|
59 | <name>parser.skip.truncated</name>
|
---|
60 | <value>false</value>
|
---|
61 | <description>Boolean value for whether we should skip parsing for truncated documents. By default this
|
---|
62 | property is activated due to extremely high levels of CPU which parsing can sometimes take.
|
---|
63 | </description>
|
---|
64 | </property>
|
---|
65 | -->
|
---|
66 |
|
---|
67 | <property>
|
---|
68 | <name>http.content.limit</name>
|
---|
69 | <value>-1</value>
|
---|
70 | <description>The length limit for downloaded content using the http://
|
---|
71 | protocol, in bytes. If this value is nonnegative (>=0), content longer
|
---|
72 | than it will be truncated; otherwise, no truncation at all. Do not
|
---|
73 | confuse this setting with the file.content.limit setting.
|
---|
74 | </description>
|
---|
75 | </property>
|
---|
76 |
|
---|
77 | <!--https://stackoverflow.com/questions/4871972/how-to-speed-up-crawling-in-nutch-->
|
---|
78 | <!--
|
---|
79 | <property>
|
---|
80 | <name>fetcher.threads.per.queue</name>
|
---|
81 | <value>1</value>
|
---|
82 | <description>This number is the maximum number of threads that
|
---|
83 | should be allowed to access a queue at one time. Setting it to
|
---|
84 | a value > 1 will cause the Crawl-Delay value from robots.txt to
|
---|
85 | be ignored and the value of fetcher.server.min.delay to be used
|
---|
86 | as a delay between successive requests to the same server instead
|
---|
87 | of fetcher.server.delay.
|
---|
88 | </description>
|
---|
89 | </property>
|
---|
90 | -->
|
---|
91 | <property>
|
---|
92 | <name>generate.max.count</name>
|
---|
93 | <value>50</value>
|
---|
94 | <description>The maximum number of urls in a single
|
---|
95 | fetchlist. -1 if unlimited. The urls are counted according
|
---|
96 | to the value of the parameter generate.count.mode.
|
---|
97 | </description>
|
---|
98 | </property>
|
---|
99 | <property>
|
---|
100 | <name>fetcher.throughput.threshold.pages</name>
|
---|
101 | <value>1</value>
|
---|
102 | <description>The threshold of minimum pages per second. If the fetcher downloads fewer
|
---|
103 | pages per second than the configured threshold, the fetcher stops, preventing slow queues
|
---|
104 | from stalling the throughput. This threshold must be an integer. This can be useful when
|
---|
105 | fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
|
---|
106 | </description>
|
---|
107 | </property>
|
---|
108 | <property>
|
---|
109 | <name>fetcher.server.delay</name>
|
---|
110 | <value>0.5</value>
|
---|
111 | <description>The number of seconds the fetcher will delay between
|
---|
112 | successive requests to the same server. Note that this might get
|
---|
113 | overridden by a Crawl-Delay from a robots.txt and is used ONLY if
|
---|
114 | fetcher.threads.per.queue is set to 1.
|
---|
115 | </description>
|
---|
116 | </property>
|
---|
117 |
|
---|
118 |
|
---|
119 | </configuration>
|
---|