source: main/trunk/model-sites-dev/commoncrawl/siteConfig.xml@ 36214

Last change on this file since 36214 was 34132, checked in by ak19, 4 years ago

Committing the commoncrawl site of Nutch recrawls of our CC data where content-language = MRI. 1. Contains the collection configuration files, but also the keep-urls *.txt files in the etc folder, used by NutchTextDumpPlugin to filter URLs of interest. 2. The import_nutchDumpTxtsOfcrawledMRICC.tar.gz file needs to be decompressed into any of the collections that need to be rebuilt. This contains just the Nutch dump.txt files (in their siteID folders) as I've removed the binary files. 3. The script moveDumpTxtFilesIntoImport.sh can be used to generate such cut down versions of the Nutch crawled folders that contain only the dump.txt files within their siteID folders. 4. In the next commit, I'll try to add svn externals to get the import_nutchDumpTxtsOfcrawledMRICC.tar.gz from sitelevel into the collection folders for the 2 current collections in this site.

File size: 9.2 KB
Line 
1<siteConfig xmlns:gsf='http://www.greenstone.org/greenstone3/schema/ConfigFormat' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
2 <metadataList xmlns:gsf="http://www.greenstone.org/greenstone3/schema/MetadataFormat" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <!-- site-wide metadata entries -->
3 <metadata name="siteAdmin">admin@example.com</metadata> <!-- placeholder contact address (the exported value was an e-mail-obfuscation stub); set the real administrator address when deploying -->
4 <!--<metadata name="siteURL">http://www.greenstone.org/greenstone3/</metadata>-->
5 </metadataList>
6 <displayItemList> <!-- human-readable site name and description -->
7 <displayItem name="siteName" lang="en">DL of CommonCrawl MRI recrawls with Nutch</displayItem> <!-- literal English site name -->
8 <displayItem name="siteName" key="siteName" dictionary="siteConfig"/> <!-- same item given as a key/dictionary reference instead of literal text -->
9 <displayItem name="siteDescription" lang="en">Collections of Nutch crawls of CommonCrawl results where content language was MRI.</displayItem> <!-- literal English site description -->
10 <displayItem name="siteDescription" key="siteDescription" dictionary="siteConfig"/> <!-- key/dictionary form of the description -->
11 </displayItemList>
12 <siteList/> <!-- empty: no entries are listed here -->
13 <format> <!-- site-level format settings -->
14 <paramDefault name="favouritebasket" value="on"/> <!-- default value "on" for the favouritebasket parameter -->
15 </format>
16 <serviceRackList>
17 <!-- Authentication service rack, including the reCAPTCHA settings it uses -->
18 <serviceRack name="Authentication">
19 <!-- The two keys below are testing keys: create real ones at https://www.google.com/recaptcha/admin and substitute them before deploying the site -->
20 <recaptcha name="site_key" value="6LeIxAcTAAAAAJcZVRqyHh71UMIEGNQ_MXjiZKhI"/>
21 <recaptcha name="secret_key" value="6LeIxAcTAAAAAGG-vFI1TnRWxMZNFuojJ4WifJWe"/>
22 <!-- Comma-separated operations that get a reCAPTCHA; allowed values: Register, AddUser, EditUser, AccountSettings -->
23 <recaptcha name="operations" value="Register,AddUser"/>
24 </serviceRack>
25 <serviceRack name="ArchiveIO"/> <!-- the racks on lines 25-33 are declared by name only, with no nested configuration -->
26 <serviceRack name="CollectionGroups"/>
27 <serviceRack name="DocumentMaker"/>
28 <serviceRack name="DocXMLUtil"/>
29 <serviceRack name="DocumentBasket"/>
30 <serviceRack name="BerryBasket"/>
31 <serviceRack name="GS2Construct"/>
32 <serviceRack name="DebugService"/>
33 <serviceRack name="UserTracker"/>
34 <serviceRack name="CrossCollectionSearch">
35 <format>
36 <!-- template applied to each documentNode in the cross-collection search results; it emits four table cells: source-file link, document link, title, and a link to the owning collection's about page -->
37 <gsf:template match="documentNode">
38 <xsl:variable name="collname" select="@collection"/>
39 <td><gsf:if-metadata-exists><gsf:metadata name="srclinkFile"/><gsf:if><a><xsl:attribute name='href'>sites/commoncrawl/collect/<xsl:value-of select='@collection'/>/index/assoc/<gsf:metadata name="assocfilepath"/>/<gsf:metadata name="srclinkFile"/></xsl:attribute><gsf:choose-metadata><gsf:metadata name="thumbicon"/><gsf:metadata name="srcicon"/></gsf:choose-metadata></a></gsf:if></gsf:if-metadata-exists></td>
40 <td><a><xsl:attribute name='href'><xsl:value-of select="$library_name"/>/collection/<xsl:value-of select='@collection'/>/document/<xsl:value-of select='@nodeID'/><xsl:if test="$opt-doc-link-args">?<xsl:value-of select="$opt-doc-link-args"/></xsl:if></xsl:attribute><gsf:icon/></a></td>
41 <td><gsf:metadata name='Title'/></td>
42 <td><a href='{$library_name}/collection/{$collname}/page/about'><xsl:value-of select="/page/pageResponse/service/paramList/param[@name='collection']/option[@name=$collname]/displayItem[@name='name']"/> </a></td>
43 </gsf:template>
44 </format>
45 </serviceRack>
46 <!-- uncomment to provide the GATE tagging service (kept outside CrossCollectionSearch so that, once enabled, it is a direct child of serviceRackList like the other racks) -->
47 <!--<serviceRack name="GATEServices"/>-->
48 </serviceRackList>
49 <serviceClusterList/> <!-- empty: no service clusters are declared -->
50 <!-- Global replace lists available to all collections. At this stage they only apply to Greenstone 2 collections. They attempt to resolve most of the common macros that appear in metadata. You can add more items here if you want them to apply to all collections. To turn a list on/off for a particular collection, add <replaceListRef id='xx'/> to the collectionConfig for each list that you want to use. -->
51 <!-- replace elements can look like:
52 <replace macro="original text" scope="text|metadata|all" text="replacement text"/>
53 <replace macro="original text" scope="text|metadata|all" metadata="metadata element value to replace with"/>
54 <replace macro="original text" scope="text|metadata|all" key="dictionary key" bundle="dictionary name"/> Every entry below also carries a resolve="true|false" attribute (NOTE(review): its exact effect is not described here; confirm against the macro resolver). -->
55 <replaceList id="gs2-standard"> <!-- icon macros first, then path/metadata macros, then URL macros -->
56 <replace macro="_iconpdf_" scope="metadata" text="&lt;img src='interfaces/default/images/ipdf.gif' border='0'/&gt;" resolve="false"/>
57 <replace macro="_icondoc_" scope="metadata" text="&lt;img src='interfaces/default/images/imsword.gif' border='0'/&gt;" resolve="false"/>
58 <replace macro="_icondocx_" scope="metadata" text="&lt;img src='interfaces/default/images/imsword.gif' border='0'/&gt;" resolve="false"/>
59 <replace macro="_iconps_" scope="metadata" text="&lt;img src='interfaces/default/images/ips.gif' border='0'/&gt;" resolve="false"/>
60 <replace macro="_iconrtf_" scope="metadata" text="&lt;img src='interfaces/default/images/irtf.gif' border='0'/&gt;" resolve="false"/>
61 <replace macro="_iconxls_" scope="metadata" text="&lt;img src='interfaces/default/images/iexcel.gif' border='0'/&gt;" resolve="false"/>
62 <replace macro="_iconppt_" scope="metadata" text="&lt;img src='interfaces/default/images/ippt.gif' border='0'/&gt;" resolve="false"/>
63 <replace macro="_icontext_" scope="metadata" text="&lt;img src='interfaces/default/images/itext.gif' border='0'/&gt;" resolve="false"/>
64 <replace macro="_iconblanktext_" scope="metadata" text="&lt;img src='interfaces/default/images/ibtext.gif' border='0'/&gt;" resolve="false"/>
65 <replace macro="_iconmp3_" scope="metadata" text="&lt;img src='interfaces/default/images/imp3.gif' border='0'/&gt;" resolve="false"/>
66 <replace macro="_iconmidi_" scope="metadata" text="&lt;img src='interfaces/default/images/imidi.gif' border='0'/&gt;" resolve="false"/>
67 <replace macro="_iconworld_" scope="metadata" text="&lt;img src='interfaces/default/images/iworld.gif' border='0'/&gt;" resolve="false"/>
68 <replace macro="_iconunknown_" scope="metadata" text="&lt;img src='interfaces/default/images/iunknown.gif' border='0'/&gt;" resolve="false"/>
69
70 <replace macro="[archivedir]" scope="all" metadata="assocfilepath" resolve="false"/>
71 <replace macro="[assocfilepath]" scope="all" metadata="assocfilepath" resolve="false"/>
72 <replace macro="[srcurl]" scope="metadata" metadata="srcurl" resolve="false"/>
73 <replace macro="[SourceFile]" scope="metadata" metadata="SourceFile" resolve="false"/>
74 <!-- _httpsite_ and _clustername_ are macros that are defined in the java code (NOTE(review): _libraryname_, used below, presumably is as well; confirm) -->
75 <replace macro="_httpcollection_" scope="metadata" text="_httpsite_/collect/_clustername_" resolve="true"/>
76 <replace macro="_httpprefix_" scope="metadata" text="_httpsite_" resolve="true"/>
77 <replace macro="_httpdocimg_" scope="text" text="_httpsite_/collect/_clustername_/index/assoc/[assocfilepath]/" resolve="true"/>
78 <replace macro="_httpsamepagelink_" scope="text" text="_libraryname_/collection/_clustername_/document/[DocOID]" resolve="true"/>
79 <replace macro="[collection]" scope="all" text="_clustername_" resolve="true"/>
80 <replace macro="_httpextlink_" scope="all" text="_libraryname_?el=&amp;a=d&amp;c=_clustername_&amp;d=" resolve="true"/>
81 </replaceList>
82 <replaceList id="gs2-image"> <!-- maps each bracketed image macro to the metadata element of the same name -->
83 <replace macro="[Image]" scope="metadata" metadata="Image" resolve="false"/>
84 <replace macro="[Thumb]" scope="metadata" metadata="Thumb" resolve="false"/>
85 <replace macro="[ThumbWidth]" scope="metadata" metadata="ThumbWidth" resolve="false"/>
86 <replace macro="[ThumbHeight]" scope="metadata" metadata="ThumbHeight" resolve="false"/>
87 <replace macro="[Screen]" scope="metadata" metadata="Screen" resolve="false"/>
88 <replace macro="[ScreenWidth]" scope="metadata" metadata="ScreenWidth" resolve="false"/>
89 <replace macro="[ScreenHeight]" scope="metadata" metadata="ScreenHeight" resolve="false"/>
90 </replaceList>
91 <replaceList id="gs2-months"> <!-- month-name macros (00 to 12) plus _num_, each given as a key in the interface_default bundle -->
92 <replace macro="_textmonth00_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth00"/>
93 <replace macro="_textmonth01_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth01"/>
94 <replace macro="_textmonth02_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth02"/>
95 <replace macro="_textmonth03_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth03"/>
96 <replace macro="_textmonth04_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth04"/>
97 <replace macro="_textmonth05_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth05"/>
98 <replace macro="_textmonth06_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth06"/>
99 <replace macro="_textmonth07_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth07"/>
100 <replace macro="_textmonth08_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth08"/>
101 <replace macro="_textmonth09_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth09"/>
102 <replace macro="_textmonth10_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth10"/>
103 <replace macro="_textmonth11_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth11"/>
104 <replace macro="_textmonth12_" scope="metadata" resolve="false" bundle="interface_default" key="textmonth12"/>
105 <replace macro="_num_" scope="metadata" resolve="false" bundle="interface_default" key="num"/>
106 </replaceList>
107</siteConfig>
Note: See TracBrowser for help on using the repository browser.