[26375] | 1 | <?xml version="1.0" encoding="UTF-8"?>
|
---|
| 2 | <!-- $Id: foxmlToLucene.xslt $ -->
|
---|
| 3 | <xsl:stylesheet version="1.0"
|
---|
| 4 | xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
---|
| 5 | xmlns:exts="xalan://dk.defxws.fedoragsearch.server.GenericOperationsImpl"
|
---|
| 6 | exclude-result-prefixes="exts"
|
---|
| 7 | xmlns:foxml="info:fedora/fedora-system:def/foxml#"
|
---|
| 8 | xmlns:dtu_meta="http://www.dtu.dk/dtu_meta/"
|
---|
| 9 | xmlns:meta="http://www.dtu.dk/dtu_meta/meta/"
|
---|
| 10 | xmlns:dc="http://purl.org/dc/elements/1.1/"
|
---|
| 11 | xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
|
---|
| 12 | xmlns:ex="http://www.greenstone.org/namespace/fake/ex"
|
---|
| 13 | xmlns:dls="http://www.greenstone.org/namespace/fake/dls">
|
---|
<!-- The generated IndexDocument is serialised as indented UTF-8 XML. -->
<xsl:output method="xml"
            indent="yes"
            encoding="UTF-8"/>

<!--
  Stylesheet parameters. The values below are only defaults; a caller of the
  transformation may override any of them.
  NOTE(review): the @tomcatserver@, @tomcatport@ and @fedorapassw@ tokens look
  like placeholders substituted at build or install time; confirm.
-->

<!-- Repository name: written to the REPOSITORYNAME index field and passed to the exts: functions. -->
<xsl:param name="REPOSITORYNAME"
           select="'FgsRepos'"/>

<!-- Fedora base URL. Declared for callers; not referenced inside this stylesheet. -->
<xsl:param name="REPOSBASEURL"
           select="'http://@tomcatserver@:@tomcatport@/fedora'"/>

<!-- Fedora SOAP endpoint: passed to the exts: functions and used to derive the REPOSBASEURL index field. -->
<xsl:param name="FEDORASOAP"
           select="'http://@tomcatserver@:@tomcatport@/fedora/services'"/>

<!-- Credentials handed to the exts: functions when they fetch datastreams. -->
<xsl:param name="FEDORAUSER"
           select="'fedoraAdmin'"/>
<xsl:param name="FEDORAPASS"
           select="'@fedorapassw@'"/>

<!-- Trust store location and password, also handed to the exts: functions. -->
<xsl:param name="TRUSTSTOREPATH"
           select="'trustStorePath'"/>
<xsl:param name="TRUSTSTOREPASS"
           select="'trustStorePass'"/>
|
---|
| 23 | <!--
|
---|
| 24 | This xslt stylesheet generates the IndexDocument consisting of IndexFields
|
---|
| 25 | from a FOXML record. The IndexFields are:
|
---|
| 26 | - from the root element = PID
|
---|
| 27 | - from foxml:property = type, state, contentModel, ...
|
---|
| 28 | - from oai_dc:dc = title, creator, ...
|
---|
| 29 | The IndexDocument element gets a PID attribute, which is mandatory,
|
---|
| 30 | while the PID IndexField is optional.
|
---|
| 31 | Options for tailoring:
|
---|
| 32 | - IndexField types, see Lucene javadoc for Field.Store, Field.Index, Field.TermVector
|
---|
| 33 | - IndexField boosts, see Lucene documentation for explanation
|
---|
| 34 | - IndexDocument boosts, see Lucene documentation for explanation
|
---|
| 35 | - generation of IndexFields from other XML metadata streams than DC
|
---|
      - e.g. inline XML, as with the EX and DLS metadata datastreams indexed below (no uvalibdesc stylesheet is included or called in this file)
|
---|
| 37 | - for not inline XML, the datastream may be fetched with the document() function,
|
---|
| 38 | see the example below (however, none of the demo objects can test this)
|
---|
| 39 | - generation of IndexFields from other datastream types than XML
|
---|
| 40 | - from datastream by ID, text fetched, if mimetype can be handled
|
---|
| 41 | - from datastream by sequence of mimetypes,
|
---|
| 42 | text fetched from the first mimetype that can be handled,
|
---|
| 43 | default sequence given in properties.
|
---|
| 44 | -->
|
---|
| 45 |
|
---|
<!-- PID of the FOXML object being transformed, read from the root element. -->
<xsl:variable name="PID"
              select="/foxml:digitalObject/@PID"/>

<!-- Document boost written to IndexDocument/@boost.
     Any other calculation may be used here; the default boost is 1.0. -->
<xsl:variable name="docBoost"
              select="1.4 * 2.5"/>
|
---|
| 48 |
|
---|
<!--
  Entry point. Always emits exactly one IndexDocument element carrying the
  PID attribute (mandatory for indexing to work) and a boost attribute
  (example of setting a boost). The element is filled with IndexFields only
  when all three gates below hold; otherwise it stays empty.
-->
<xsl:template match="/">
  <!-- Gate 1: only active FedoraObjects are indexed. -->
  <xsl:variable name="objectIsActive"
                select="boolean(foxml:digitalObject/foxml:objectProperties/foxml:property[@NAME='info:fedora/fedora-system:def/model#state' and @VALUE='Active'])"/>
  <!-- Gate 2: objects carrying a METHODMAP or DS-COMPOSITE-MODEL datastream are skipped. -->
  <xsl:variable name="hasModelDatastream"
                select="boolean(foxml:digitalObject/foxml:datastream[@ID='METHODMAP'] or foxml:digitalObject/foxml:datastream[@ID='DS-COMPOSITE-MODEL'])"/>
  <!-- Gate 3: PID prefix filter. With the empty prefix every PID passes;
       put a prefix into the second argument to restrict indexing. -->
  <xsl:variable name="pidPrefixMatches"
                select="starts-with($PID,'')"/>

  <IndexDocument PID="{$PID}" boost="{$docBoost}">
    <xsl:if test="$objectIsActive and not($hasModelDatastream) and $pidPrefixMatches">
      <xsl:apply-templates mode="activeFedoraObject"/>
    </xsl:if>
  </IndexDocument>
</xsl:template>
|
---|
| 68 |
|
---|
<!--
  Generates the IndexFields for one FOXML object. Applied in mode
  "activeFedoraObject" with foxml:digitalObject as the context node, so all
  relative paths below start at that element.

  Fields produced, in order:
    - PID, REPOSITORYNAME, REPOSBASEURL
    - fgs.*            one per foxml:property of the object
    - dc.*             one per child element of the inline oai_dc:dc record
    - TITLE_UNTOK, AUTHOR_UNTOK   untokenized copies for sorting
    - ex.* and dls.*   one per inline ex:metadata / dls:metadata element
    - output of exts:getDatastreamFromTika for each M, E or R datastream
    - foxml.all.text   all text of the record plus fetched datastream text
    - ds.fulltext      fetched datastream text only
-->
<xsl:template match="/foxml:digitalObject" mode="activeFedoraObject">
  <!-- The PID index field lets you search on the PID value -->
  <IndexField IFname="PID" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="$PID"/>
  </IndexField>
  <IndexField IFname="REPOSITORYNAME" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="$REPOSITORYNAME"/>
  </IndexField>
  <!-- The base URL is $FEDORASOAP minus its last 9 characters, which is the
       length of the '/services' suffix in the default parameter value. -->
  <IndexField IFname="REPOSBASEURL" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="substring($FEDORASOAP, 1, string-length($FEDORASOAP)-9)"/>
  </IndexField>

  <!-- indexing foxml property fields -->
  <!-- The field name is 'fgs.' plus the part of @NAME after the '#'. -->
  <xsl:for-each select="foxml:objectProperties/foxml:property">
    <IndexField index="UN_TOKENIZED" store="YES" termVector="NO">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('fgs.', substring-after(@NAME,'#'))"/>
      </xsl:attribute>
      <xsl:value-of select="@VALUE"/>
    </IndexField>
  </xsl:for-each>

  <!-- indexing inline dc fields -->
  <!-- Only the last datastreamVersion of each datastream is read. The field
       name is 'dc.' plus the element name after its prefix colon. -->
  <xsl:for-each select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc/*">
    <IndexField index="TOKENIZED" store="YES" termVector="YES">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('dc.', substring-after(name(),':'))"/>
      </xsl:attribute>
      <xsl:value-of select="text()"/>
    </IndexField>
  </xsl:for-each>

  <!-- indexing dc fields for sorting (must be UN_TOKENIZED) -->
  <!-- xsl:value-of takes the first node in document order, so only the first
       dc:title and the first dc:creator are used. -->
  <IndexField IFname="TITLE_UNTOK" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc/dc:title"/>
  </IndexField>
  <IndexField IFname="AUTHOR_UNTOK" index="UN_TOKENIZED" store="YES" termVector="NO" boost="1.0">
    <xsl:value-of select="foxml:datastream/foxml:datastreamVersion[last()]/foxml:xmlContent/oai_dc:dc/dc:creator"/>
  </IndexField>

  <!-- indexing inline ex:metadata of datastreams whose ID starts with 'EX';
       the field name is 'ex.' plus the element's name attribute -->
  <xsl:for-each select="foxml:datastream[starts-with(@ID,'EX')]/foxml:datastreamVersion[last()]/foxml:xmlContent/ex:ex/ex:metadata">
    <IndexField index="TOKENIZED" store="YES" termVector="YES">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('ex.', @name)"/>
      </xsl:attribute>
      <xsl:value-of select="text()"/>
    </IndexField>
  </xsl:for-each>

  <!-- indexing inline dls:metadata of datastreams whose ID starts with 'DLS';
       the field name is 'dls.' plus the element's name attribute -->
  <xsl:for-each select="foxml:datastream[starts-with(@ID,'DLS')]/foxml:datastreamVersion[last()]/foxml:xmlContent/dls:dls/dls:metadata">
    <IndexField index="TOKENIZED" store="YES" termVector="YES">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('dls.', @name)"/>
      </xsl:attribute>
      <xsl:value-of select="text()"/>
    </IndexField>
  </xsl:for-each>

  <!-- a datastream is fetched, if its mimetype
       can be handled, the text becomes the value of the field.
       This is the version using PDFBox,
       below is the new version using Apache Tika. -->
  <!--
  <xsl:for-each select="foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
    <IndexField index="TOKENIZED" store="YES" termVector="NO">
      <xsl:attribute name="IFname">
        <xsl:value-of select="concat('ds.', @ID)"/>
      </xsl:attribute>
      <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
    </IndexField>
  </xsl:for-each>
  -->

  <!-- Text and metadata extraction using Apache Tika.
       Parameters for getDatastreamFromTika, getDatastreamTextFromTika, and getDatastreamMetadataFromTika:
       - indexFieldTagName    : either "IndexField" (with the Lucene plugin) or "field" (with the Solr plugin)
       - textIndexField       : fieldSpec for the text index field, null or empty if not to be generated (not used with getDatastreamMetadataFromTika)
       - indexfieldnamePrefix : optional or empty, prefixed to the metadata indexfield names (not used with getDatastreamTextFromTika)
       - selectedFields       : comma-separated list of metadata fieldSpecs, if empty then all fields are included with default params (not used with getDatastreamTextFromTika)
       - fieldSpec            : metadataFieldName ['=' indexFieldName] ['/' [index] ['/' [store] ['/' [termVector] ['/' [boost]]]]]
           metadataFieldName must be exactly as extracted by Tika from the document.
             You may see the available names if you log in debug mode,
             look for "METADATA name=" under "fullDsId=" in the log, when "getFromTika" was called during updateIndex
           indexFieldName is used as the generated index field name,
             if not given, GSearch uses metadataFieldName after replacement of the characters ' ', ':', '/', '=', '(', ')' with '_'
           the following parameters are used with Lucene (with Solr these values are specified in schema.xml)
             index      : ['TOKENIZED'|'UN_TOKENIZED'] # first alternative is default
             store      : ['YES'|'NO']                 # first alternative is default
             termVector : ['YES'|'NO']                 # first alternative is default
             boost      : <decimal number>             # '1.0' is default
  -->
  <!-- The returned string is written with output escaping disabled.
       NOTE(review): presumably it is ready-made IndexField markup; confirm
       against GenericOperationsImpl. -->
  <xsl:for-each select="foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
    <xsl:value-of disable-output-escaping="yes" select="exts:getDatastreamFromTika($PID, $REPOSITORYNAME, @ID, 'IndexField', concat('ds.', @ID), concat('dsmd_', @ID, '.'), '', $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
  </xsl:for-each>

  <!-- example of a dissemination identified in bDefPid, methodName, parameters, asOfDateTime is fetched,
       if its mimetype can be handled, the text becomes the value of the IndexField.
       parameters format is 'name=value name2=value2' -->
  <!--
  <IndexField IFname="fgs.Diss.text" index="TOKENIZED" store="YES" termVector="NO"
              bDefPid="demo:19" methodName="getPDF" parameters="" asOfDateTime="" >
  </IndexField>
  -->

  <!-- for not inline XML, the datastream may be fetched with the document() function -->
  <!--
  <xsl:call-template name="example-of-xml-not-inline"/>
  -->

  <!-- This is an example of calling an extension function, see Apache Xalan, may be used for filters.
  <IndexField IFname="fgs.DS" index="TOKENIZED" store="YES" termVector="NO">
    <xsl:value-of select="exts:someMethod($PID)"/>
  </IndexField>
  -->

  <!--
    creating an index field with all text from the foxml record and its datastreams
  -->
  <IndexField IFname="foxml.all.text" index="TOKENIZED" store="YES" termVector="YES">
    <xsl:for-each select="//text()">
      <xsl:value-of select="."/>
    </xsl:for-each>
    <xsl:for-each select="//foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
      <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
    </xsl:for-each>
  </IndexField>

  <!-- creating an index field with the fetched text of the M, E and R datastreams only -->
  <IndexField IFname="ds.fulltext" index="TOKENIZED" store="YES" termVector="YES">
    <xsl:for-each select="//foxml:datastream[@CONTROL_GROUP='M' or @CONTROL_GROUP='E' or @CONTROL_GROUP='R']">
      <xsl:value-of select="exts:getDatastreamText($PID, $REPOSITORYNAME, @ID, $FEDORASOAP, $FEDORAUSER, $FEDORAPASS, $TRUSTSTOREPATH, $TRUSTSTOREPASS)"/>
      <!-- Separator between the texts of consecutive datastreams. It is a
           no-break space written as a character reference: the literal
           character had been corrupted into a capital A with circumflex plus
           a space by an encoding round trip, and that stray letter would
           have ended up in the indexed text. -->
      <xsl:text>&#160;</xsl:text>
    </xsl:for-each>
  </IndexField>

</xsl:template>
|
---|
| 208 |
|
---|
| 209 |
|
---|
<!--
  Example (due to Simon Lamb and Steve Bayliss) of indexing XML that is not
  inline in the FOXML record: the datastream content is loaded with the
  document() function and a field is built from it.
  Uses the test object test:fgs23. The namespaces used here (meta) have to be
  declared on the stylesheet element.
  The only call to this template in the stylesheet is commented out.
-->
<xsl:template name="example-of-xml-not-inline">

  <!-- Fetch the testMapplXml datastream content over HTTP. -->
  <xsl:variable name="fetchedDatastream"
                select="document('http://localhost:8080/fedora/objects/test:fgs23/datastreams/testMapplXml/content')"/>

  <!-- xsl:value-of yields the string value of the first meta:title found
       anywhere in the fetched document. -->
  <IndexField IFname="testMapplXml.meta.title">
    <xsl:value-of select="$fetchedDatastream//meta:title"/>
  </IndexField>

</xsl:template>
|
---|
| 223 |
|
---|
| 224 | </xsl:stylesheet>
|
---|