Changeset 37227 for gs3-installations


Ignore:
Timestamp:
2023-01-31T22:24:55+13:00 (15 months ago)
Author:
davidb
Message:

Updated collection

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/intermuse/trunk/sites/intermuse/collect/programmes-and-performers/etc/collectionConfig.xml

    r37166 r37227  
    1212    </metadataList>
    1313    <displayItemList>
    14         <displayItem lang="en" name="name">Programmes and Performers</displayItem>
     14        <displayItem lang="en" name="name">HMS Programmes and Performers</displayItem>
    1515        <displayItem lang="en" name="text">text</displayItem>
    1616        <displayItem lang="en" name="ex.im.Work,dc.Title,ex.dc.Title,Title">titles</displayItem>
     
    2222        <displayItem lang="en" name="section">section</displayItem>
    2323        <!--
    24         <displayItem lang="en" name="shortDescription">Programmes from the IntermusE corpus enhanced at ingest time using the Google Vision API</displayItem>
     24        <displayItem lang="en" name="shortDescription">Programmes from the InterMusE corpus enhanced at ingest time using the Google Vision API</displayItem>
    2525        -->
    26         <displayItem lang="en" name="shortDescription">OCR'd programmes from the IntermusE corpus enriched with metadata about the performances and performers</displayItem>
     26        <displayItem lang="en" name="shortDescription">OCR'd programmes from the InterMusE corpus enriched with metadata about the performances and performers</displayItem>
    2727    </displayItemList>
    2828    <format>
     
    106106              -->
    107107              <xsl:template name="coll-description">
    108                 <p>
     108                <div style="float: right; width: 300px;">
     109                  <img style="width: 100%;" src="interfaces/{$interface_name}/images/intermuse-title-logo.png" />
     110                </div>
     111
     112                <p style="padding-top: 0.7rem;">
    109113                 
    110114                  Live musical events play a vital role in community life across the
     
    112116                  traces on the historical record, even in modern times. While
    113117                  musicologists have used some types of concert ephemera to capture the
    114                   nature and identity of musical events, by there very nature these
     118                  nature and identity of musical events, by their very nature these
    115119                  resources can be confusingly inconsistent, tantalisingly incomplete,
    116120                  and often scattered between different archives and collections.
    117121                </p>
    118 
    119                 <p>
    120                   This (prototype) InterMusE Digital Library is a resource developed as the result
     122                <!--
     123                <p>
     124                  This <i>prototype</i> InterMusE Digital Library is a resource developed as the result
    121125                  of a two-year project, funded by AHRC’s UK-US New Directions for Digital
    122126                  Scholarship in Cultural Institutions programme, that seeks
    123127                  to better capture and represent these historical events.
    124                  
     128
    125129                  Using natural-language processing, optical character recognition (OCR),
    126130                  and other forms of artificial intelligence, this digital library
    127131                  brings together an array of digitised resources sourced from:
     132                  </p>
     133                -->
     134               
     135                <p>
     136                  This <a href="https://intermuse.datatodata.org/" target="_blank">InterMusE Project</a>
     137                  is a two-year research endeavour,
     138                  funded by AHRC’s UK-US New Directions for Digital
     139                  Scholarship in Cultural Institutions programme, that seeks
     140                  to better capture and represent these historical events,               
     141                  leveraging natural-language processing, optical character recognition (OCR),
     142                  and other forms of artificial intelligence.
     143                  To illustrate the potential of the approach we work with digitised resources
     144                  sourced from:
     145                </p>
     146               
    128147                  <ul>
    129148                <li>
     
    140159                </li>
    141160                  </ul>
    142                   Material is also sourced from three former chapters of the British Music Society (est. 1918):
     161                  <p>Material is also sourced from three former chapters of the British Music Society (est. 1918):</p>
    143162                  <ul>
    144163                <li>
     
    149168                </li>
    150169                <li>
    151                   <a href="https://www.belfastmusicsociety.org/">Belfast Music Society</a>.
     170                  <a href="https://www.belfastmusicsociety.org/" target="_blank" rel="noreferrer noopener">Belfast Music Society</a>.
    152171                </li>
    153172                  </ul>
    154                 </p>
    155 
    156                 <p>
    157                   Unifying these resources ... Linked Open Data ....
    158 
     173
     174
     175               
     176                <h3>Prototype Digital Library</h3>
     177
     178                <p>
     179                  Greenstone3 is an open source digital library system with a
     180                  versatile service-based software architecture, managed through an extension
     181                  mechanism.                 
     182                  Taking the Huddersfield Music Society Programmes as the
     183                  set of digitised content processed,
     184                  this online resource demonstrates how Greenstone3 can be used
     185                  to meet the aspirations
     186                  of the InterMusE project.
     187                </p>
     188
     189                <!--
     190                <p>
     191                  This prototype collection contains <xsl:value-of select="$numdocs"/> documents
     192                  focusing on a sample of programmes from the Huddersfield Music Society.
     193                </p>
     194                -->
     195               
     196                <p>
     197                  <!-- Linked Open Data is used to unify these resources. -->
     198                 
     199                  When content is added to the Digital Library it is automatically
     200                  processed using the Google Vision API, and any text extracted added
     201                  to the digital library's full-text index
     202                  as well as stored as Linked Open Data using the
     203                  <a href="https://dev.gdmrdigital.com/" target="_blank">Simple Annotation Server</a>.
     204                  We make the OCR'd text available as Open Annotations,
     205                  accessible through a <a href="https://projectmirador.org/" target="_blank">Mirador3 Image Viewer</a>
     206                  embedded into the digital library.
     207                  Through the Mirador3 Viewer, annotations can be edited (correcting OCR errors, for example),
     208                  as well as allowing for the
     209                  addition of completely new annotations (unrelated to the OCR'd text, if so desired).
     210                  We use <a href="https://jena.apache.org/" target="_blank">Apache Jena Fuseki</a>
     211                  as the internal triplestore for the Simple Annotation Server.
     212                  This means all the OCR'd content—along with
     213                  all the other metadata amassed in the digital library—can also be accessed via a SPARQL endpoint.
    159214                  More details are available through the
    160215                  <a href="https://intermuse.datatodata.org/" target="_blank">InterMusE project website</a>.
    161216                </p>
    162                  
     217
     218                <p>
     219                  In addition to the automatically generated OCR'd content,
     220                  through the InterMusE Project an Excel spreadsheet
     221                  has been painstakingly assembled from the programmes,
     222                  recording who the performers were, and which musical works they performed at what concert.
     223                  We fold this into the digital library collection, both as information to display, but
     224                  also as metadata that can be used to enrich how users can locate content of
     225                  interest to them in the collection.
     226                </p>
     227
     228                <h3>Designed for Different Types of User</h3>
    163229                <p>
    164230                  Use the browsing and searching features the digital library provides to locate content
    165                   of interest.  Register as a user  to become an annotator/editor of the content.
    166                   As a developer seeking to enrich the forms of access to this content,
    167                   a machine-readable version of the content is accessable through a
     231                  of interest.  Register as a user to become an annotator/editor of the content.
     232                  As an external developer, interested in further enriching the forms of access to this content,
     233                  a machine-readable version of the content is accessible through the following
    168234                  <a href="{$library_name}/collection/{$collName}/page/sparql">SPARQL endpoint</a>
    169235                </p>
    170                 <p>
    171                   <gslib:collectionDescriptionTextAndServicesLinks/>
     236                <p>
     237                  <gslib:collectionDescriptionTextAndServicesLinks/>
     238                </p>
     239
     240                <xsl:variable name="raw_date">
     241                  <gslib:collectionMeta name="buildDate"/>
     242                </xsl:variable>
     243                <xsl:variable name="formatted_date">
     244                  <xsl:value-of select="util:formatTimeStamp($raw_date, 0, 3, /page/@lang)"/>
     245                </xsl:variable>
     246                <xsl:variable name="numdocs">
     247                  <gslib:collectionMeta name="numDocs"/>
     248                </xsl:variable>
     249                <p>
     250                  This prototype collection contains <xsl:value-of select="$numdocs"/> documents focusing on a sample of programmes from the Huddersfield Music Society.
     251                  <!--
     252                  <xsl:value-of select="util:getInterfaceText($interface_name, /page/@lang, 'about.standarddescriptiondays', concat($numdocs, ';', $formatted_date))"/>
     253                  -->
     254                </p>
     255               
     256               
     257                <h3>Implementation Details</h3>
     258
     259                <p>
     260                  To form this prototype InterMusE digital library we have taken
     261                  the base digital library system and added in Greenstone's
     262                  extensions for:
     263                  <ul>
     264                <li>
     265                  <a href="https://trac.greenstone.org/browser/gs3-extensions/structured-image/trunk">structured-image</a>
     266                  to automatically perform OCR on programme pages using Google Vision's API;
     267                </li>
     268                <li>
     269                  <a href="https://trac.greenstone.org/browser/gs3-extensions/iiif-servlet/trunk/src">iiif-servlet</a>
     270                  to allow images in the digital library to be
     271                  available at a range of resolutions via the IIIF Image API; and
     272                </li>
     273                <li>
     274                  <a href="https://trac.greenstone.org/browser/gs2-extensions/apache-jena/trunk/src">apache-jena</a>
     275                  so content—such as annotations added to
     276                  programme pages—can be accessed as Linked Data.
     277                </li>
     278                  </ul>
     279                </p>
     280               
     281    <p>
     282      A key strength of the Greenstone3 software architecture is its
     283      ability to be customised, which is aligned with its three phases
     284      to forming a digital library collection: importing, building,
     285      and runtime presentation.  The first two phases typically go
     286      hand-in-hand, and form the ingest process by which content
     287      selected for the digital library collection is turned into a
     288      browseable and searchable online resource.
     289    </p>
     290    <p>
     291      Importing centres around a pipeline of document processing
     292      plugins, written in Perl, that turn a wide array of document
     293      and metadata formats into a canonical format known as
     294      GreenstoneXML.  Using one folder per document, this format
     295      represents everything that constitutes the processed document:
     296      the text and metadata of the document,
     297      along with any supporting files.  The internal format
     298      allows for hierarchical structure, such as occurs
     299      in Word, PDF, and HTML documents using headings.
     300      Metadata can be attached to any level of the hierarchy.
     301      Examples of associated files include automatically generated
     302      web-friendly resources such as
     303      <!-- as an MP3 version of
     304      a high quality FLAC audio recording, for instance, -->
     305      screen-sized and thumbnail-sized images in the case
     306      of photos, embedded resources,  and the original file itself so it can be
     307      downloaded.
     308
     309      <!-- GreenstoneMETS -->
     310    </p>
     311    <p>
     312      In terms of customisation, plugins support a
     313      myriad of settings for fine-tuning how the processing is
     314      undertaken.  New plugins can also be introduced at any time,
     315      with the digital library system automatically detecting their
     316      presence.
     317    </p>
     318
     319    <p>
     320      The building step takes the standardised XML form, and processes
     321      it to form the backend indexes and database structures needed to
     322      deliver the forms of searching—such as full-text search, and
     323      search by title—and browsing—such as a hierarchical subject
     324      classification—specified in the collection's configuration file.
     325     
     326      Effectively the building phase turns the standardised/serialised
     327      GreenstoneXML form back into in-memory data-structures representing
     328      a document's hierarchical structure of text and metadata, along
     329      with how supporting files relate to that.
     330      Following the directives specified in the collection's
     331      configuration file, it is
     332      then a simple matter to transmit this text, metadata, and associated
     333      files as needed to the digital library's indexing/database/backing-store.
     334      <!--
     335      so it can be used by the runtime system to provide the
     336     
     337      to be used by the runtime system
     338      -->
     339    </p>
     340    <p>     
     341      Beyond the customisations that can be specified in a collection
     342      configuration file for the building phase, Greenstone supports
     343      orthogonal indexers.  Like the document processing plugins used
     344      in importing, orthogonal indexers are modules written in Perl,
     345      and their inclusion is automatically detected by the Greenstone3
     346      installation.  Orthogonal indexers get presented the same
     347      in-memory stream of &quot;reconstructed&quot; documents,
     348      allowing them to undertake additional processing if
     349      required (such as computing audio features), which can
     350      then be transmitted to specialist indexing/database/backing-store
     351      (such as a content-based music recommender system), or otherwise
     352      added to the existing indexing/database/backing-store.
     353    </p>
     354    <p>
     355      The third phase of the Greenstone3 digital library architecture
     356      governs how functionality is accessed and data is extracted from
     357      the digital library and presented to the user. The Greenstone3
     358      runtime is a service-based architecture, written in Java,
     359      consisting of a network of connected modules.  Modules are
     360      self-describing and advertise the services they offer.
     361      Communication between modules is by XML messages, with the
     362      service handling the final layer of communication responsible
     363      for presentation.  Here, XSL Transforms (XSLTs) are used to
     364      convert the underlying XML content into the web page displayed
     365      by the digital library, blending in CSS and Javascript
     366      files that control appearance and
     367      functionality.
     368    </p>
     369    <p>
     370      The XSLT files are grouped together in one place, forming the
     371      interface for the digital library. An inheritance mechanism is
     372      deployed throughout this part of the design.  A collection can
     373      override individual XSLT template rules as required to tweak
     374      presentation details.  A collection can also provide an entire
     375      replacement XSLT file if so desired.  For more substantial
     376      changes a new interface is typically developed.
     377    </p>
     378
     379    <p>
     380     
     381      In terms of crafting the features and functionality to form this digital library,
     382      we made use of all three areas of customisation. ...
     383     
     384<!--
     385      there
     386      are three keys parts of the Greenstone3 design where
     387     
     388      area where customisation
     389     
     390
     391      digital library collection into
     392     
     393                  the online resource
     394
     395                  features and functionality
     396                 
     397                  In developing a Greenstone3 digital library collection, there are three key phases
     398                  to consider: importing, building, and runtime-display.
     399
     400                  The first
     401                  XML message based .. XSL Transforms (XSLT)
     402                 
     403                  The there are three key phases to the
     404
     405                  Its modular design
     406                 
     407                  The modular design of Greenstone3 provides several stages where
     408                 
     409                  ...
     410
     411                  importing
     412                  building
     413                  runtime-display
     414                 
     415                  orthogonal indexes
     416-->
     417
     418                  <!--
     419                  Three key 'hook-in' points within the Greenstone3 software architecture
     420                  for customisation are: the Perl-based document processing plugins
     421                  used in the content ingest pipeline, through which
     422                  content and metadata are ingested into a digital library
     423                  collection
     424                 
     425                  Perl-based document processing pipeline
     426                 
     427                  Woven together in the following way
     428                 
     429                  We have applied
     430                 
     431                  Mirador
     432                  SimpleAnnotationServer
     433                 
     434                  This forms the framework for this developed
     435                 
     436                  In developing this online resource, we have applied it
     437                  -->
    172438                </p>
    173                 <xsl:variable name="raw_date">
    174                     <gslib:collectionMeta name="buildDate"/>
    175                 </xsl:variable>
    176                 <xsl:variable name="formatted_date">
    177                     <xsl:value-of select="util:formatTimeStamp($raw_date, 0, 3, /page/@lang)"/>
    178                 </xsl:variable>
    179                 <xsl:variable name="numdocs">
    180                     <gslib:collectionMeta name="numDocs"/>
    181                 </xsl:variable>
    182                 <p>
    183                   This prototype collection contains <xsl:value-of select="$numdocs"/> documents.
    184                   <!--
    185                     <xsl:value-of select="util:getInterfaceText($interface_name, /page/@lang, 'about.standarddescriptiondays', concat($numdocs, ';', $formatted_date))"/>
    186                   -->
    187                 </p>
    188             </xsl:template>
     439
     440               
     441                  </xsl:template>
    189442        </format>
    190443    <search type="solr">
     
    297550    </search>
    298551
     552    <!--
    299553    <search type="jenaTDB" orthogonal="true"/>
    300 
     554    -->
     555   
    301556    <infodb type="jdbm"/>
    302557   
     
    354609    </import>
    355610    <browse>
    356         <classifier name="List">
    357             <option name="-metadata" value="dc.Title,Title"/>
    358             <option name="-partition_type_within_level" value="approximate_size"/>
    359             <option name="-numeric_partition_type_within_level" value="approximate_size"/>
    360             <option name="-metadata_selection_mode_within_level" value="firstvalue"/>
     611      <classifier name="List">
     612        <!--
     613          <option name="-metadata" value="dc.Title,Title"/>
     614        -->
     615          <option name="-metadata" value="Volume"/>
     616          <option name="-buttonname" value="Programmes"/>
     617          <option name="-partition_type_within_level" value="approximate_size"/>
     618          <option name="-numeric_partition_type_within_level" value="approximate_size"/>
     619          <option name="-metadata_selection_mode_within_level" value="firstvalue"/>
    361620
    362621            <format>
     
    415674
    416675        <classifier name="AZCompactList">
    417             <option name="-metadata" value="ex.im.Work,Title"/>
     676            <option name="-metadata" value="ex.im.Work"/>
    418677            <option name="-buttonname" value="Performances"/>
    419678            <option name="-mingroup" value="2"/>
    420679        </classifier>
     680        <!--
    421681        <classifier name="AZCompactList">
    422682            <option name="-metadata" value="ex.im.No"/>
    423683            <option name="-buttonname" value="IntermusE IDs"/>
    424684        </classifier>
     685        -->
    425686        <classifier name="AZCompactList">
    426687            <option name="-metadata" value="ex.im.Artist"/>
     
    444705                <td valign="top">
    445706                    <gsf:link type="document">
    446                         <gsf:choose-metadata>
     707                            <gsf:choose-metadata>
    447708                            <gsf:metadata name="thumbicon"/>
    448709                            <gsf:metadata name="srcicon"/>
     
    568829                    <gsf:metadata name="ex.im.Venue"      >Venue:         </gsf:metadata>
    569830                    <gsf:metadata name="ex.im.Date"       >Date:          </gsf:metadata>
    570                     <gsf:metadata name="ex.im.ProgAndPage">Programme/Page:</gsf:metadata>
     831                    <gsf:metadata name="ex.im.ProgAndPage">Programme/Concert:</gsf:metadata>
    571832                </gsf:metadata-table>
    572833            </xsl:template>
Note: See TracChangeset for help on using the changeset viewer.