1 | <?xml version="1.0" encoding="UTF-8"?>
|
---|
2 | <!-- $Id: foxmlToLucene.xslt $ -->
|
---|
3 | <xsl:stylesheet version="1.0"
|
---|
4 | xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
---|
5 | xmlns:exts="xalan://dk.defxws.fedoragsearch.server.GenericOperationsImpl"
|
---|
6 | exclude-result-prefixes="exts"
|
---|
7 | xmlns:foxml="info:fedora/fedora-system:def/foxml#"
|
---|
8 | xmlns:dtu_meta="http://www.dtu.dk/dtu_meta/"
|
---|
9 | xmlns:meta="http://www.dtu.dk/dtu_meta/meta/"
|
---|
10 | xmlns:dc="http://purl.org/dc/elements/1.1/"
|
---|
11 | xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
|
---|
12 | xmlns:ex="http://www.greenstone.org/namespace/fake/ex"
|
---|
13 | xmlns:dls="http://www.greenstone.org/namespace/fake/dls">
|
---|
<!-- The result (an IndexDocument element tree) is serialized as indented UTF-8 XML. -->
<xsl:output encoding="UTF-8" indent="yes" method="xml"/>

<!--
    Stylesheet parameters. Each one has a default value that applies when the
    caller does not supply it (presumably GSearch overrides them from its
    repository configuration; verify against the deployment).
-->

<!-- Name of the repository; written to the REPOSITORYNAME index field and
     passed to the exts: extension functions. -->
<xsl:param name="REPOSITORYNAME" select="'FgsRepos'"/>

<!-- Base URL of the repository. NOTE(review): not referenced by the templates
     in this stylesheet; the REPOSBASEURL index field is derived from
     FEDORASOAP instead. -->
<xsl:param name="REPOSBASEURL" select="'http://localhost:8080/fedora'"/>

<!-- Connection settings handed to the exts: extension functions when
     datastream content is fetched. -->
<xsl:param name="FEDORASOAP" select="'http://localhost:8080/fedora/services'"/>
<xsl:param name="FEDORAUSER" select="'fedoraAdmin'"/>
<xsl:param name="FEDORAPASS" select="'fedoraAdmin'"/>

<!-- Trust store settings, also handed to the exts: extension functions. -->
<xsl:param name="TRUSTSTOREPATH" select="'trustStorePath'"/>
<xsl:param name="TRUSTSTOREPASS" select="'trustStorePass'"/>
|
---|
23 | <!--
|
---|
24 | This xslt stylesheet generates the IndexDocument consisting of IndexFields
|
---|
25 | from a FOXML record. The IndexFields are:
|
---|
26 | - from the root element = PID
|
---|
27 | - from foxml:property = type, state, contentModel, ...
|
---|
28 | - from oai_dc:dc = title, creator, ...
|
---|
29 | The IndexDocument element gets a PID attribute, which is mandatory,
|
---|
30 | while the PID IndexField is optional.
|
---|
31 | Options for tailoring:
|
---|
32 | - IndexField types, see Lucene javadoc for Field.Store, Field.Index, Field.TermVector
|
---|
33 | - IndexField boosts, see Lucene documentation for explanation
|
---|
34 | - IndexDocument boosts, see Lucene documentation for explanation
|
---|
35 | - generation of IndexFields from other XML metadata streams than DC
|
---|
36 | - e.g. the EX and DLS metadata datastreams indexed below, where the XML is inline
|
---|
37 | - for not inline XML, the datastream may be fetched with the document() function,
|
---|
38 | see the example below (however, none of the demo objects can test this)
|
---|
39 | - generation of IndexFields from other datastream types than XML
|
---|
40 | - from datastream by ID, text fetched, if mimetype can be handled
|
---|
41 | - from datastream by sequence of mimetypes,
|
---|
42 | text fetched from the first mimetype that can be handled,
|
---|
43 | default sequence given in properties.
|
---|
44 | -->
|
---|
45 |
|
---|
<!-- Identifier of the object being indexed, read from the PID attribute of
     the FOXML document element. -->
<xsl:variable name="PID" select="/foxml:digitalObject/@PID"/>

<!-- Boost written on the IndexDocument element. Any other calculation may be
     used here; the default boost is 1.0. -->
<xsl:variable name="docBoost" select="1.4 * 2.5"/>
|
---|
48 |
|
---|
<!--
    Entry point. Always emits one IndexDocument element carrying the mandatory
    PID attribute and the document boost. The IndexFields are generated only
    when all three conditions below hold; otherwise the IndexDocument is empty.
-->
<xsl:template match="/">
    <!-- Only objects whose state property is 'Active' are indexed. -->
    <xsl:variable name="isActive"
                  select="boolean(foxml:digitalObject/foxml:objectProperties/foxml:property[@NAME='info:fedora/fedora-system:def/model#state' and @VALUE='Active'])"/>
    <!-- Objects carrying a METHODMAP or DS-COMPOSITE-MODEL datastream are skipped. -->
    <xsl:variable name="hasModelDatastream"
                  select="boolean(foxml:digitalObject/foxml:datastream[@ID='METHODMAP']) or boolean(foxml:digitalObject/foxml:datastream[@ID='DS-COMPOSITE-MODEL'])"/>
    <!-- PID prefix filter. The empty prefix matches every PID; put a PID
         namespace here to restrict indexing to it. -->
    <xsl:variable name="pidAccepted" select="starts-with($PID,'')"/>

    <!-- The PID attribute is mandatory for indexing to work; boost is an
         example of setting a document boost. -->
    <IndexDocument PID="{$PID}" boost="{$docBoost}">
        <xsl:if test="$isActive and not($hasModelDatastream) and $pidAccepted">
            <xsl:apply-templates mode="activeFedoraObject"/>
        </xsl:if>
    </IndexDocument>
</xsl:template>
|
---|
68 |
|
---|
<!--
    Generates the IndexFields for one FOXML object. Applied in mode
    "activeFedoraObject" to the foxml:digitalObject document element.

    Fields produced, in order:
      - PID, REPOSITORYNAME, REPOSBASEURL
      - fgs.*          one per foxml:property
      - dc.*           one per element of the inline oai_dc:dc record
      - TITLE_UNTOK, AUTHOR_UNTOK   untokenized copies for sorting
      - ex.* / dls.*   inline metadata from the EX* and DLS* datastreams
      - Tika output    text and metadata for every M/E/R datastream
      - foxml.all.text, ds.fulltext   aggregate full-text fields

    Reads the global variable $PID and the stylesheet parameters
    REPOSITORYNAME, FEDORASOAP, FEDORAUSER, FEDORAPASS, TRUSTSTOREPATH and
    TRUSTSTOREPASS.
-->
<xsl:template match="/foxml:digitalObject" mode="activeFedoraObject">

    <!-- The oai_dc:dc record(s) found in the last version of each datastream. -->
    <xsl:variable name="dcRecord"
                  select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc"/>

    <!-- The PID index field lets you search on the PID value -->
    <IndexField IFname="PID" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
        <xsl:value-of select="$PID"/>
    </IndexField>
    <IndexField IFname="REPOSITORYNAME" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
        <xsl:value-of select="$REPOSITORYNAME"/>
    </IndexField>
    <!-- The base URL is FEDORASOAP with its last 9 characters removed
         (the length of '/services' in the default value). -->
    <IndexField IFname="REPOSBASEURL" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
        <xsl:value-of select="substring($FEDORASOAP, 1, string-length($FEDORASOAP)-9)"/>
    </IndexField>

    <!-- indexing foxml property fields: the field name is 'fgs.' followed by
         the part of the property NAME after '#' -->
    <xsl:for-each select="foxml:objectProperties/foxml:property">
        <IndexField IFname="{concat('fgs.', substring-after(@NAME,'#'))}"
                    index="UN_TOKENIZED" store="YES" termVector="NO">
            <xsl:value-of select="@VALUE"/>
        </IndexField>
    </xsl:for-each>

    <!-- indexing inline dc fields. local-name() is used instead of splitting
         name() at ':' so that the field is still named dc.title, dc.creator,
         ... when the element is serialized without a namespace prefix. -->
    <xsl:for-each select="$dcRecord/*">
        <IndexField IFname="{concat('dc.', local-name())}"
                    index="TOKENIZED" store="YES" termVector="YES">
            <xsl:value-of select="text()"/>
        </IndexField>
    </xsl:for-each>

    <!-- indexing dc fields for sorting (must be UN_TOKENIZED); value-of takes
         the first title / creator in document order -->
    <IndexField IFname="TITLE_UNTOK" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
        <xsl:value-of select="$dcRecord/dc:title"/>
    </IndexField>
    <IndexField IFname="AUTHOR_UNTOK" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
        <xsl:value-of select="$dcRecord/dc:creator"/>
    </IndexField>

    <!-- indexing inline ex:metadata elements from datastreams whose ID starts
         with 'EX'; the field name is 'ex.' followed by the name attribute -->
    <xsl:for-each select="foxml:datastream[starts-with(@ID,'EX')]/foxml:datastreamVersion[last()]/foxml:xmlContent/ex:ex/ex:metadata">
        <IndexField IFname="{concat('ex.', @name)}"
                    index="TOKENIZED" store="YES" termVector="YES">
            <xsl:value-of select="text()"/>
        </IndexField>
    </xsl:for-each>

    <!-- same for dls:metadata elements from datastreams whose ID starts with 'DLS' -->
    <xsl:for-each select="foxml:datastream[starts-with(@ID,'DLS')]/foxml:datastreamVersion[last()]/foxml:xmlContent/dls:dls/dls:metadata">
        <IndexField IFname="{concat('dls.', @name)}"
                    index="TOKENIZED" store="YES" termVector="YES">
            <xsl:value-of select="text()"/>
        </IndexField>
    </xsl:for-each>

    <!-- a datastream is fetched, if its mimetype
         can be handled, the text becomes the value of the field.
         This is the (disabled) version using PDFBox,
         below is the new version using Apache Tika. -->
    <!--
    <xsl:for-each select="foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
        <IndexField index="TOKENIZED" store="YES" termVector="NO">
            <xsl:attribute name="IFname">
                <xsl:value-of select="concat('ds.', @ID)"/>
            </xsl:attribute>
            <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
        </IndexField>
    </xsl:for-each>
    -->

    <!-- Text and metadata extraction using Apache Tika.
         Parameters for getDatastreamFromTika, getDatastreamTextFromTika, and getDatastreamMetadataFromTika:
         - indexFieldTagName    : either "IndexField" (with the Lucene plugin) or "field" (with the Solr plugin)
         - textIndexField       : fieldSpec for the text index field, null or empty if not to be generated (not used with getDatastreamMetadataFromTika)
         - indexfieldnamePrefix : optional or empty, prefixed to the metadata indexfield names (not used with getDatastreamTextFromTika)
         - selectedFields       : comma-separated list of metadata fieldSpecs, if empty then all fields are included with default params (not used with getDatastreamTextFromTika)
         - fieldSpec            : metadataFieldName ['=' indexFieldName] ['/' [index] ['/' [store] ['/' [termVector] ['/' [boost]]]]]
             metadataFieldName must be exactly as extracted by Tika from the document.
                 You may see the available names if you log in debug mode,
                 look for "METADATA name=" under "fullDsId=" in the log, when "getFromTika" was called during updateIndex
             indexFieldName is used as the generated index field name,
                 if not given, GSearch uses metadataFieldName after replacement of the characters ' ', ':', '/', '=', '(', ')' with '_'
             the following parameters are used with Lucene (with Solr these values are specified in schema.xml)
                 index      : ['TOKENIZED'|'UN_TOKENIZED']  # first alternative is default
                 store      : ['YES'|'NO']                  # first alternative is default
                 termVector : ['YES'|'NO']                  # first alternative is default
                 boost      : <decimal number>              # '1.0' is default
    -->
    <!-- The result is written with output escaping disabled; presumably the
         extension returns ready-made IndexField markup as a string. -->
    <xsl:for-each select="foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
        <xsl:value-of disable-output-escaping="yes" select="exts:getDatastreamFromTika($PID, $REPOSITORYNAME, @ID, 'IndexField', concat('ds.', @ID), concat('dsmd_', @ID, '.'), '', $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
    </xsl:for-each>

    <!-- example of a dissemination identified in bDefPid, methodName, parameters, asOfDateTime is fetched,
         if its mimetype can be handled, the text becomes the value of the IndexField.
         parameters format is 'name=value name2=value2' -->
    <!--
    <IndexField IFname="fgs.Diss.text" index="TOKENIZED" store="YES" termVector="NO"
                bDefPid="demo:19" methodName="getPDF" parameters="" asOfDateTime="" >
    </IndexField>
    -->

    <!-- for not inline XML, the datastream may be fetched with the document() function -->
    <!--
    <xsl:call-template name="example-of-xml-not-inline"/>
    -->

    <!-- This is an example of calling an extension function, see Apache Xalan, may be used for filters.
    <IndexField IFname="fgs.DS" index="TOKENIZED" store="YES" termVector="NO">
        <xsl:value-of select="exts:someMethod($PID)"/>
    </IndexField>
    -->

    <!-- Text of every M/E/R datastream, each followed by a no-break space
         (written as a character reference so that no file re-encoding can
         garble it) to keep the last word of one datastream from fusing with
         the first word of the next. Collected once and reused by both
         aggregate fields below, so each datastream is fetched once here
         instead of once per field. -->
    <xsl:variable name="datastreamText">
        <xsl:for-each select="//foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
            <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
            <xsl:text>&#160;</xsl:text>
        </xsl:for-each>
    </xsl:variable>

    <!-- creating an index field with all text from the foxml record and its datastreams -->
    <IndexField IFname="foxml.all.text" index="TOKENIZED" store="YES" termVector="YES">
        <xsl:for-each select="//text()">
            <xsl:value-of select="."/>
        </xsl:for-each>
        <xsl:value-of select="$datastreamText"/>
    </IndexField>

    <!-- creating an index field with the text of the datastreams only -->
    <IndexField IFname="ds.fulltext" index="TOKENIZED" store="YES" termVector="YES">
        <xsl:value-of select="$datastreamText"/>
    </IndexField>

</xsl:template>
|
---|
208 |
|
---|
209 |
|
---|
<!--
    Example (due to Simon Lamb and Steve Bayliss) of indexing XML that is not
    inline in the FOXML: the datastream content is loaded with document() and
    queried like any other node tree. Uses the test object test:fgs23; the
    namespaces of the fetched XML must be declared on the stylesheet element.
    Emits one IndexField, testMapplXml.meta.title, holding the first
    meta:title found in the fetched document.
-->
<xsl:template name="example-of-xml-not-inline">
    <!-- Content URL of the testMapplXml datastream of test:fgs23. -->
    <xsl:variable name="contentUrl"
                  select="'http://localhost:8080/fedora/objects/test:fgs23/datastreams/testMapplXml/content'"/>
    <xsl:variable name="fetchedXml" select="document($contentUrl)"/>

    <IndexField IFname="testMapplXml.meta.title">
        <xsl:value-of select="$fetchedXml//meta:title"/>
    </IndexField>
</xsl:template>
|
---|
223 |
|
---|
224 | </xsl:stylesheet>
|
---|