source: gs3-extensions/fedora/trunk/src/fedoragsearch-files/foxmlToLucene.xslt@ 26432

Last change on this file since 26432 was 26432, checked in by ak19, 8 years ago

Committing before renaming: references to port, server name and fedora password are replaced by placeholder strings. In the next commit these files will become placeholder files that build.xml will generate the active files from.

File size: 11.2 KB
Line 
1<?xml version="1.0" encoding="UTF-8"?>
2<!-- $Id: foxmlToLucene.xslt $ -->
3<xsl:stylesheet version="1.0"
4 xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
5 xmlns:exts="xalan://dk.defxws.fedoragsearch.server.GenericOperationsImpl"
6 exclude-result-prefixes="exts"
7 xmlns:foxml="info:fedora/fedora-system:def/foxml#"
8 xmlns:dtu_meta="http://www.dtu.dk/dtu_meta/"
9 xmlns:meta="http://www.dtu.dk/dtu_meta/meta/"
10 xmlns:dc="http://purl.org/dc/elements/1.1/"
11 xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
12 xmlns:ex="http://www.greenstone.org/namespace/fake/ex"
13 xmlns:dls="http://www.greenstone.org/namespace/fake/dls">
<!-- The IndexDocument is serialised as indented UTF-8 XML. -->
<xsl:output method="xml"
            indent="yes"
            encoding="UTF-8"/>

<!--
  Stylesheet parameters. Each default below is used only when the caller
  does not supply a value. Tokens of the form @name@ are placeholders that
  are meant to be substituted before this stylesheet is deployed.
-->

<!-- Repository name: emitted as the REPOSITORYNAME index field and passed
     to the exts extension functions. -->
<xsl:param name="REPOSITORYNAME" select="'FgsRepos'"/>

<!-- Base URL of the Fedora web application. -->
<xsl:param name="REPOSBASEURL" select="'http://@tomcatserver@:@tomcatport@/fedora'"/>

<!-- Fedora SOAP endpoint: passed to the exts extension functions; the
     REPOSBASEURL index field is also derived from it. -->
<xsl:param name="FEDORASOAP" select="'http://@tomcatserver@:@tomcatport@/fedora/services'"/>

<!-- Credentials handed to the exts extension functions. -->
<xsl:param name="FEDORAUSER" select="'fedoraAdmin'"/>
<xsl:param name="FEDORAPASS" select="'@fedorapassw@'"/>

<!-- Trust store location and password handed to the exts extension functions. -->
<xsl:param name="TRUSTSTOREPATH" select="'trustStorePath'"/>
<xsl:param name="TRUSTSTOREPASS" select="'trustStorePass'"/>
23<!--
24 This xslt stylesheet generates the IndexDocument consisting of IndexFields
25 from a FOXML record. The IndexFields are:
26 - from the root element = PID
27 - from foxml:property = type, state, contentModel, ...
28 - from oai_dc:dc = title, creator, ...
29 The IndexDocument element gets a PID attribute, which is mandatory,
30 while the PID IndexField is optional.
31 Options for tailoring:
32 - IndexField types, see Lucene javadoc for Field.Store, Field.Index, Field.TermVector
33 - IndexField boosts, see Lucene documentation for explanation
34 - IndexDocument boosts, see Lucene documentation for explanation
35 - generation of IndexFields from other XML metadata streams than DC
36 - e.g. the ex and dls metadata datastreams indexed below, where the XML is inline
37 - for not inline XML, the datastream may be fetched with the document() function,
38 see the example below (however, none of the demo objects can test this)
39 - generation of IndexFields from other datastream types than XML
40 - from datastream by ID, text fetched, if mimetype can be handled
41 - from datastream by sequence of mimetypes,
42 text fetched from the first mimetype that can be handled,
43 default sequence given in properties.
44-->
45
<!-- PID of the FOXML object being transformed, read from the root element. -->
<xsl:variable name="PID" select="/foxml:digitalObject/@PID"/>

<!-- Boost applied to the whole IndexDocument. Any numeric expression may be
     used here; the default boost is 1.0. -->
<xsl:variable name="docBoost" select="1.4 * 2.5"/>
48
<xsl:template match="/">
  <!--
    Entry point: always emits the IndexDocument wrapper, carrying the
    mandatory PID attribute (indexing does not work without it) and an
    example document boost. IndexFields are generated only when every
    clause of the test below holds:
      1. the object's state property is 'Active';
      2. the object has no METHODMAP datastream;
      3. the object has no DS-COMPOSITE-MODEL datastream;
      4. the PID starts with the given prefix. The prefix is the empty
         string here, so this clause always holds; put a PID namespace
         prefix in its place to restrict which objects are indexed.
  -->
  <IndexDocument PID="{$PID}" boost="{$docBoost}">
    <xsl:if test="foxml:digitalObject/foxml:objectProperties/foxml:property[@NAME='info:fedora/fedora-system:def/model#state' and @VALUE='Active']
                  and not(foxml:digitalObject/foxml:datastream[@ID='METHODMAP'])
                  and not(foxml:digitalObject/foxml:datastream[@ID='DS-COMPOSITE-MODEL'])
                  and starts-with($PID,'')">
      <xsl:apply-templates mode="activeFedoraObject"/>
    </xsl:if>
  </IndexDocument>
</xsl:template>
68
<xsl:template match="/foxml:digitalObject" mode="activeFedoraObject">
  <!--
    Generates the IndexFields for one FOXML object, in this order:
      - PID, REPOSITORYNAME and REPOSBASEURL;
      - one fgs.* field per foxml:property;
      - one dc.* field per child of the inline oai_dc:dc record;
      - TITLE_UNTOK and AUTHOR_UNTOK for sorting;
      - ex.* and dls.* fields from the inline EX and DLS metadata datastreams;
      - text and metadata of managed (M), external (E) and redirected (R)
        datastreams, extracted through Apache Tika;
      - foxml.all.text and ds.fulltext catch-all fields.
  -->

  <!-- The PID index field lets you search on the PID value -->
  <IndexField IFname="PID" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="$PID"/>
  </IndexField>
  <IndexField IFname="REPOSITORYNAME" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="$REPOSITORYNAME"/>
  </IndexField>
  <!-- The base URL is derived by dropping the last 9 characters of
       $FEDORASOAP; this assumes the SOAP endpoint ends in '/services'
       (9 characters), as the default value of FEDORASOAP does. -->
  <IndexField IFname="REPOSBASEURL" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="substring($FEDORASOAP, 1, string-length($FEDORASOAP)-9)"/>
  </IndexField>

  <!-- indexing foxml property fields: the field name is 'fgs.' plus the
       part of the property NAME after '#' -->
  <xsl:for-each select="foxml:objectProperties/foxml:property">
    <IndexField index="UN_TOKENIZED" store="YES" termVector="NO">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('fgs.', substring-after(@NAME,'#'))"/>
      </xsl:attribute>
      <xsl:value-of select="@VALUE"/>
    </IndexField>
  </xsl:for-each>

  <!-- indexing inline dc fields (latest datastream version only).
       local-name() is used rather than the text after ':' in name(), so
       the field name is still correct when the DC elements are written
       without a namespace prefix. -->
  <xsl:for-each select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc/*">
    <IndexField index="TOKENIZED" store="YES" termVector="YES">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('dc.', local-name())"/>
      </xsl:attribute>
      <xsl:value-of select="text()"/>
    </IndexField>
  </xsl:for-each>

  <!-- indexing dc fields for sorting (must be UN_TOKENIZED) -->
  <IndexField IFname="TITLE_UNTOK" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc/dc:title"/>
  </IndexField>
  <IndexField IFname="AUTHOR_UNTOK" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc/dc:creator"/>
  </IndexField>

  <!-- indexing inline ex:metadata elements of datastreams whose ID starts
       with 'EX': the field name is 'ex.' plus the element's name attribute -->
  <xsl:for-each select="foxml:datastream[starts-with(@ID,'EX')]/foxml:datastreamVersion[last()]/foxml:xmlContent/ex:ex/ex:metadata">
    <IndexField index="TOKENIZED" store="YES" termVector="YES">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('ex.', @name)"/>
      </xsl:attribute>
      <xsl:value-of select="text()"/>
    </IndexField>
  </xsl:for-each>

  <!-- indexing inline dls:metadata elements of datastreams whose ID starts
       with 'DLS': the field name is 'dls.' plus the element's name attribute -->
  <xsl:for-each select="foxml:datastream[starts-with(@ID,'DLS')]/foxml:datastreamVersion[last()]/foxml:xmlContent/dls:dls/dls:metadata">
    <IndexField index="TOKENIZED" store="YES" termVector="YES">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('dls.', @name)"/>
      </xsl:attribute>
      <xsl:value-of select="text()"/>
    </IndexField>
  </xsl:for-each>

  <!-- a datastream is fetched, if its mimetype
       can be handled, the text becomes the value of the field.
       This (disabled) block is the version using PDFBox,
       below is the new version using Apache Tika. -->
  <!--
  <xsl:for-each select="foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
    <IndexField index="TOKENIZED" store="YES" termVector="NO">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('ds.', @ID)"/>
      </xsl:attribute>
      <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
    </IndexField>
  </xsl:for-each>
  -->

  <!-- Text and metadata extraction using Apache Tika.
       Parameters for getDatastreamFromTika, getDatastreamTextFromTika, and getDatastreamMetadataFromTika:
       - indexFieldTagName : either "IndexField" (with the Lucene plugin) or "field" (with the Solr plugin)
       - textIndexField : fieldSpec for the text index field, null or empty if not to be generated (not used with getDatastreamMetadataFromTika)
       - indexfieldnamePrefix : optional or empty, prefixed to the metadata indexfield names (not used with getDatastreamTextFromTika)
       - selectedFields : comma-separated list of metadata fieldSpecs, if empty then all fields are included with default params (not used with getDatastreamTextFromTika)
       - fieldSpec : metadataFieldName ['=' indexFieldName] ['/' [index] ['/' [store] ['/' [termVector] ['/' [boost]]]]]
         metadataFieldName must be exactly as extracted by Tika from the document.
         You may see the available names if you log in debug mode,
         look for "METADATA name=" under "fullDsId=" in the log, when "getFromTika" was called during updateIndex
         indexFieldName is used as the generated index field name,
         if not given, GSearch uses metadataFieldName after replacement of the characters ' ', ':', '/', '=', '(', ')' with '_'
         the following parameters are used with Lucene (with Solr these values are specified in schema.xml)
         index : ['TOKENIZED'|'UN_TOKENIZED'] # first alternative is default
         store : ['YES'|'NO'] # first alternative is default
         termVector : ['YES'|'NO'] # first alternative is default
         boost : <decimal number> # '1.0' is default
  -->
  <!-- NOTE(review): output escaping is disabled here, presumably because
       the extension function returns ready-made index field markup as a
       string; confirm against the GSearch implementation. -->
  <xsl:for-each select="foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
    <xsl:value-of disable-output-escaping="yes" select="exts:getDatastreamFromTika($PID, $REPOSITORYNAME, @ID, 'IndexField', concat('ds.', @ID), concat('dsmd_', @ID, '.'), '', $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
  </xsl:for-each>

  <!-- example of a dissemination identified in bDefPid, methodName, parameters, asOfDateTime is fetched,
       if its mimetype can be handled, the text becomes the value of the IndexField.
       parameters format is 'name=value name2=value2'-->
  <!--
  <IndexField IFname="fgs.Diss.text" index="TOKENIZED" store="YES" termVector="NO"
              bDefPid="demo:19" methodName="getPDF" parameters="" asOfDateTime="" >
  </IndexField>
  -->

  <!-- for not inline XML, the datastream may be fetched with the document() function -->
  <!--
  <xsl:call-template name="example-of-xml-not-inline"/>
  -->

  <!-- This is an example of calling an extension function, see Apache Xalan, may be used for filters.
  <IndexField IFname="fgs.DS" index="TOKENIZED" store="YES" termVector="NO">
    <xsl:value-of select="exts:someMethod($PID)"/>
  </IndexField>
  -->

  <!-- The text of every M/E/R datastream is fetched once and reused by the
       two catch-all fields below, instead of calling the extension function
       twice per datastream. Each datastream's text is followed by a space
       so that the last word of one datastream and the first word of the
       next are not fused into a single token. -->
  <xsl:variable name="datastreamText">
    <xsl:for-each select="//foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
      <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
      <xsl:text> </xsl:text>
    </xsl:for-each>
  </xsl:variable>

  <!-- creating an index field with all text from the foxml record and its
       datastreams; text nodes are separated by a space so that words in
       adjacent elements stay separate tokens -->
  <IndexField IFname="foxml.all.text" index="TOKENIZED" store="YES" termVector="YES">
    <xsl:for-each select="//text()">
      <xsl:value-of select="."/>
      <xsl:text> </xsl:text>
    </xsl:for-each>
    <xsl:value-of select="$datastreamText"/>
  </IndexField>

  <!-- creating an index field with the text of the M/E/R datastreams only -->
  <IndexField IFname="ds.fulltext" index="TOKENIZED" store="YES" termVector="YES">
    <xsl:value-of select="$datastreamText"/>
  </IndexField>

</xsl:template>
208
209
<xsl:template name="example-of-xml-not-inline">

  <!-- due to Simon Lamb and Steve Bayliss -->
  <!-- using the test object test:fgs23 -->
  <!-- namespaces to be included in the stylesheet element -->

  <!--
    Example of indexing XML that is not inline in the FOXML record: the
    datastream content is fetched with document() and a field is built from
    it. The URL is built from the REPOSBASEURL parameter rather than a
    hard-coded host and port, so it follows the configured Fedora location.
    The call to this template is commented out by default.
  -->
  <xsl:variable name="testMapplXml"
                select="document(concat($REPOSBASEURL, '/objects/test:fgs23/datastreams/testMapplXml/content'))"/>

  <IndexField IFname="testMapplXml.meta.title">
    <xsl:value-of select="$testMapplXml//meta:title"/>
  </IndexField>

</xsl:template>
223
224</xsl:stylesheet>
Note: See TracBrowser for help on using the repository browser.