1 | <?xml version="1.0" encoding="utf-8" standalone="no"?>
|
---|
2 | <!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">
|
---|
3 | <Archive>
|
---|
4 | <Section>
|
---|
5 | <Description>
|
---|
6 | <Metadata name="gsdldoctype">indexed_doc</Metadata>
|
---|
7 | <Metadata name="Language">en</Metadata>
|
---|
8 | <Metadata name="Encoding">utf8</Metadata>
|
---|
9 | <Metadata name="URL">http://Scratch/ak19/gs2-diffcol-26Jul2021/collect/Enhanced-PDF/tmp/1678156845_4/pdf01.html</Metadata>
|
---|
10 | <Metadata name="UTF8URL">http://Scratch/ak19/gs2-diffcol-26Jul2021/collect/Enhanced-PDF/tmp/1678156845_4/pdf01.html</Metadata>
|
---|
11 | <Metadata name="Title">Greenstone: A Comprehensive Open-Source Digital Library Software System Ian H. Witten,* Rodger J....</Metadata>
|
---|
12 | <Metadata name="gsdlsourcefilename">import/pdf01.pdf</Metadata>
|
---|
13 | <Metadata name="gsdlsourcefilerenamemethod">url</Metadata>
|
---|
14 | <Metadata name="gsdlconvertedfilename">tmp/1678156845_4/pdf01.html</Metadata>
|
---|
15 | <Metadata name="OrigSource">pdf01.html</Metadata>
|
---|
16 | <Metadata name="Source">pdf01.pdf</Metadata>
|
---|
17 | <Metadata name="SourceFile">pdf01.pdf</Metadata>
|
---|
18 | <Metadata name="Plugin">PDFv1Plugin</Metadata>
|
---|
19 | <Metadata name="FileSize">269487</Metadata>
|
---|
20 | <Metadata name="FilenameRoot">pdf01</Metadata>
|
---|
21 | <Metadata name="FileFormat">PDF</Metadata>
|
---|
22 | <Metadata name="srcicon">_iconpdf_</Metadata>
|
---|
23 | <Metadata name="srclink_file">doc.pdf</Metadata>
|
---|
24 | <Metadata name="srclinkFile">doc.pdf</Metadata>
|
---|
25 | <Metadata name="NumPages">9</Metadata>
|
---|
26 | <Metadata name="gsdlthistype">Paged</Metadata>
|
---|
27 | <Metadata name="ex.ExifTool.ExifToolVersion">12.19</Metadata>
|
---|
28 | <Metadata name="ex.File.Directory">/Scratch/ak19/gs2-diffcol-26Jul2021/collect/Enhanced-PDF/import</Metadata>
|
---|
29 | <Metadata name="ex.File.FileAccessDate">2023:03:07 15:39:59+13:00</Metadata>
|
---|
30 | <Metadata name="ex.File.FileInodeChangeDate">2023:03:07 15:39:59+13:00</Metadata>
|
---|
31 | <Metadata name="ex.File.FileModifyDate">2023:03:07 15:39:59+13:00</Metadata>
|
---|
32 | <Metadata name="ex.File.FileName">pdf01.pdf</Metadata>
|
---|
33 | <Metadata name="ex.File.FilePermissions">100664</Metadata>
|
---|
34 | <Metadata name="ex.File.FileSize">269487</Metadata>
|
---|
35 | <Metadata name="ex.File.FileType">PDF</Metadata>
|
---|
36 | <Metadata name="ex.File.FileTypeExtension">PDF</Metadata>
|
---|
37 | <Metadata name="ex.File.MIMEType">application/pdf</Metadata>
|
---|
38 | <Metadata name="ex.PDF.Author">Bronwyn</Metadata>
|
---|
39 | <Metadata name="ex.PDF.CreateDate">2000:03:02 15:21:24</Metadata>
|
---|
40 | <Metadata name="ex.PDF.Creator">Microsoft Word</Metadata>
|
---|
41 | <Metadata name="ex.PDF.Linearized">false</Metadata>
|
---|
42 | <Metadata name="ex.PDF.PDFVersion">1.2</Metadata>
|
---|
43 | <Metadata name="ex.PDF.PageCount">9</Metadata>
|
---|
44 | <Metadata name="ex.PDF.Producer">Acrobat PDFWriter 4.0 for Power Macintosh</Metadata>
|
---|
45 | <Metadata name="Identifier">HASH1a9cea0f239f754007681b</Metadata>
|
---|
46 | <Metadata name="lastmodified">1678156799</Metadata>
|
---|
47 | <Metadata name="lastmodifieddate">20230307</Metadata>
|
---|
48 | <Metadata name="oailastmodified">1678156845</Metadata>
|
---|
49 | <Metadata name="oailastmodifieddate">20230307</Metadata>
|
---|
50 | <Metadata name="assocfilepath">HASH1a9c.dir</Metadata>
|
---|
51 | <Metadata name="gsdlassocfile">pdf01-2_1.jpg:image/jpeg:</Metadata>
|
---|
52 | <Metadata name="gsdlassocfile">pdf01-3_1.jpg:image/jpeg:</Metadata>
|
---|
53 | <Metadata name="gsdlassocfile">pdf01-4_1.jpg:image/jpeg:</Metadata>
|
---|
54 | <Metadata name="gsdlassocfile">pdf01-5_1.jpg:image/jpeg:</Metadata>
|
---|
55 | <Metadata name="gsdlassocfile">pdf01-7_1.jpg:image/jpeg:</Metadata>
|
---|
56 | <Metadata name="gsdlassocfile">pdf01-8_1.jpg:image/jpeg:</Metadata>
|
---|
57 | <Metadata name="gsdlassocfile">doc.pdf:application/pdf:</Metadata>
|
---|
58 | </Description>
|
---|
59 | <Content>
|
---|
60 |
|
---|
61 |
|
---|
62 |
|
---|
63 |
|
---|
64 |
|
---|
65 |
|
---|
66 |
|
---|
67 |
|
---|
68 |
|
---|
69 |
|
---|
70 |
|
---|
71 |
|
---|
72 |
|
---|
73 |
|
---|
74 |
|
---|
75 |
|
---|
76 |
|
---|
77 |
|
---|
78 |
|
---|
79 | </Content>
|
---|
80 | <Section>
|
---|
81 | <Description>
|
---|
82 | <Metadata name="Title">1</Metadata>
|
---|
83 | </Description>
|
---|
84 | <Content><br />
|
---|
85 | <b>Greenstone: A Comprehensive Open-Source</b><br>
|
---|
86 | <b>Digital Library Software System</b><br>
|
---|
87 | <i>Ian H. Witten,* Rodger J. McNab,† Stefan J. Boddie,* David Bainbridge*</i><br>
|
---|
88 | * Dept of Computer Science<br>
|
---|
89 | † Digilib Systems<br>
|
---|
90 | University of Waikato, New Zealand<br>
|
---|
91 | Hamilton, New Zealand<br>
|
---|
92 | E-mail: {ihw, sjboddie, davidb}@cs.waikato.ac.nz<br>
|
---|
93 | E-mail: [email protected]<br>
|
---|
94 | <b>ABSTRACT</b><br>
|
---|
95 | multilingual information retrieval to distributed computing<br>protocols, from interoperability to search engine<br>
|
---|
96 | This paper describes the Greenstone digital library<br>
|
---|
97 | technology, from metadata standards to multiformat<br>
|
---|
98 | software, a comprehensive, open-source system for the<br>
|
---|
99 | document parsing, from multimedia to multiple operating<br>
|
---|
100 | construction and presentation of information collections.<br>
|
---|
101 | systems, from Web browsers to plug-and-play DVDs.<br>
|
---|
102 | Collections built with Greenstone offer effective full-text<br>searching and metadata-based browsing facilities that are<br>
|
---|
103 | The Greenstone Digital Library Software from the New<br>
|
---|
104 | attractive and easy to use. Moreover, they are easily<br>
|
---|
105 | Zealand Digital Library (NZDL) project tackles this issue<br>
|
---|
106 | maintainable and can be augmented and rebuilt entirely<br>
|
---|
107 | by providing a new way of organizing information and<br>
|
---|
108 | automatically. The system is extensible: software<br>
|
---|
109 | making it available over the Internet. A <i>collection</i> of<br>
|
---|
110 | “plugins” accommodate different document and metadata<br>
|
---|
111 | information comprises several (typically several thousand,<br>
|
---|
112 | types.<br>
|
---|
113 | or several million) <i>documents</i>, and a uniform interface is<br>provided to all documents in a collection. A library may<br>
|
---|
114 | <b>INTRODUCTION</b><br>
|
---|
115 | include many different collections, each organized<br>differently—though there is a strong family resemblance in<br>
|
---|
116 | Notwithstanding intense research activity in the digital<br>
|
---|
117 | how collections are presented.<br>
|
---|
118 | library field during the second half of the 1990s,<br>comprehensive software systems for creating digital<br>
|
---|
119 | Making information available using this system is far more<br>
|
---|
120 | libraries are not widely available. In fact, the usual solution<br>
|
---|
121 | than “just putting it on the Web.” The collection becomes<br>
|
---|
122 | when creating a digital library is also the most<br>
|
---|
123 | maintainable, searchable, and browsable. Each collection,<br>
|
---|
124 | obvious—just put it on the Web. But consider how much<br>
|
---|
125 | prior to presentation, undergoes a “building” process that,<br>
|
---|
126 | effort is involved in constructing a Web site for a digital<br>
|
---|
127 | once established, is completely automatic. This process<br>
|
---|
128 | library. To be effective it needs to be visually attractive<br>
|
---|
129 | creates all the structures that are used at run-time for<br>
|
---|
130 | and ergonomically easy to use, incorporate convenient and<br>
|
---|
131 | accessing the collection. Searching is based on various<br>
|
---|
132 | powerful searching capabilities, and offer rich and natural<br>
|
---|
133 | indexes, while browsing is based on various metadata;<br>
|
---|
134 | browsing facilities. Above all it must be easy to maintain<br>
|
---|
135 | support structures for both are created during the building<br>
|
---|
136 | and augment, which presents a significant challenge if any<br>
|
---|
137 | operation. When new material appears it can be fully<br>
|
---|
138 | manual organization is involved.<br>
|
---|
139 | incorporated into the collection by rebuilding.<br>
|
---|
140 | The alternative is to automate these activities through<br>
|
---|
141 | To address the exceptionally broad demands of digital<br>
|
---|
142 | software tools. But the broad scope of digital library<br>
|
---|
143 | libraries, the system is public and extensible. It is issued<br>
|
---|
144 | requirements makes this a daunting prospect. Ideally the<br>
|
---|
145 | under the Gnu public license and, in the spirit of open-<br>
|
---|
146 | software should incorporate facilities ranging from<br>
|
---|
147 | source software, users are invited to contribute<br>modifications and enhancements. Only through an<br>international cooperative effort will digital library software<br>become sufficiently comprehensive to meet the world’s<br>needs. Currently the Greenstone software is used at sites in<br>Canada, Germany, New Zealand, Romania, UK, and the<br>US, and collections range from newspaper articles to<br>technical documents, from educational journals to oral<br>history, from visual art to folksongs. The software has<br>been used for collections in many different languages, and<br>for CD-ROMs that have been published by the United<br>Nations and other humanitarian agencies in Belgium,<br>France, Japan, and the US for distribution in developing<br>countries (Humanity Libraries, 1998; PAHO, 1999;<br>UNESCO, 1999; UNU, 1998). Further details can be<br>obtained from <i>www.nzdl.org</i>.<br>
|
---|
148 | <hr>
|
---|
149 | </Content>
|
---|
150 | </Section>
|
---|
151 | <Section>
|
---|
152 | <Description>
|
---|
153 | <Metadata name="Title">2</Metadata>
|
---|
154 | </Description>
|
---|
155 | <Content><br />
|
---|
156 | <IMG src="_httpdocimg_/pdf01-2_1.jpg"><br>
|
---|
157 | become a first-class component of the library. And what<br>permits it to be integrated into existing searching and<br>browsing structures without any manual intervention is<br><i>metadata</i>. This provides sufficient focus to the concept of<br>“digital library” to support the development of a<br>construction kit.<br>
|
---|
158 | <b>OVERVIEW OF GREENSTONE</b><br>
|
---|
159 | <br>Information collections built by Greenstone combine<br>extensive full-text search facilities with browsing indexes<br>based on different metadata types. There are several ways<br>for users to find information, although they differ between<br>collections depending on the metadata available and the<br>collection design. Typically you can <i>search for particular<br>words</i> that appear in the text, or within a section of a<br>document, or within a title or section heading. You can<br><i>browse documents by title</i>: just click on the displayed book<br>icon to read it. You can <i>browse documents by subject</i>.<br>Subjects are represented by bookshelves: just click on a<br>shelf to see the books. Where appropriate, documents<br>
|
---|
160 | <b>Figure 1: Searching the HDL collection</b><br>
|
---|
161 | come complete with a table of contents (constructed<br>automatically): you can click on a chapter or subsection to<br>
|
---|
162 | This paper sets the scene with a brief discussion of what a<br>
|
---|
163 | open it, expand the full table of contents, or expand the full<br>
|
---|
164 | digital library is. We then give an overview of the facilities<br>
|
---|
165 | document.<br>
|
---|
166 | offered by Greenstone and show how end users find<br>information in collections. Next we describe the files and<br>
|
---|
167 | <br>An example of searching is shown in Figure 1 where<br>
|
---|
168 | directories involved in a collection, and then discuss the<br>
|
---|
169 | documents in the Global Help Project’s Humanity<br>
|
---|
170 | processes of updating existing collections and creating new<br>
|
---|
171 | Development Library (HDL) are being searched for<br>
|
---|
172 | ones, including extending the software to provide new<br>
|
---|
173 | chapters matching the word <i>butterfly</i>. In Figure 2 the same<br>
|
---|
174 | facilities. We conclude with an overview of related work.<br>
|
---|
175 | collection is being browsed by subject: by clicking on the<br>bookshelf icons the user has discovered an item under<br>
|
---|
176 | <b>WHAT IS A DIGITAL LIBRARY?</b><br>
|
---|
177 | Section 16, Animal Husbandry. Pursuing an interest in<br>butterfly farming, the user selects a book by clicking on its<br>
|
---|
178 | <br>Ten definitions of the term “digital library” have been<br>
|
---|
179 | book icon. In Figure 3 the front cover of the book is<br>
|
---|
180 | culled from the literature by Fox (1998), and their spirit is<br>
|
---|
181 | displayed as a graphic on the left, and the automatically<br>
|
---|
182 | captured in the following brief characterization:<br>
|
---|
183 | constructed table of contents appears at the start of the<br>
|
---|
184 | <br>
|
---|
185 | document. The current focus, <i>Introduction and Summary</i>,<br>
|
---|
186 | <i>A collection of digital objects, including text,</i><br>
|
---|
187 | is shown in bold in the table of contents with its text<br>
|
---|
188 | <i>video, and audio, along with methods for access</i><br>
|
---|
189 | starting further down the page.<br>
|
---|
190 | <i>and retrieval, and for selection, organization<br>and maintenance of the collection</i><br>
|
---|
191 | <br>In accordance with Lesk’s advice, a statement of purpose<br>
|
---|
192 | <br>
|
---|
193 | and coverage accompanies each collection, along with an<br>
|
---|
194 | (Akscyn and Witten, 1998). Lesk (1998) views digital<br>
|
---|
195 | explanation of how it is organized (Figure 1 shows the<br>
|
---|
196 | libraries as “organized collections of digital information,”<br>
|
---|
197 | start of this). A distinction is made between <i>searching</i> and<br>
|
---|
198 | and wisely recommends that they articulate the principles<br>
|
---|
199 | <i>browsing</i>. Searching is full-text, and—depending on the<br>
|
---|
200 | governing what is included and how the collection is<br>
|
---|
201 | collection’s design—the user can choose between indexes<br>
|
---|
202 | organized.<br>
|
---|
203 | built from different parts of the documents, or from<br>
|
---|
204 | <br>Digital libraries are generally distinguished from the<br>
|
---|
205 | different metadata. Some collections have an index of full<br>
|
---|
206 | World-Wide Web, the essential difference being in<br>
|
---|
207 | documents, an index of sections, an index of paragraphs,<br>
|
---|
208 | selection and organization. But they are not generally<br>
|
---|
209 | an index of titles, and an index of section headings, each of<br>
|
---|
210 | distinguished from a web <i>site</i>: indeed, virtually all extant<br>
|
---|
211 | which can be searched for particular words or phrases.<br>
|
---|
212 | digital libraries manifest themselves as a web site. Hence<br>
|
---|
213 | Browsing involves data structures created from metadata<br>
|
---|
214 | the obvious question: to make a digital library, why not<br>
|
---|
215 | that the user can examine: lists of authors, lists of titles,<br>
|
---|
216 | just put the information on the Web?<br>
|
---|
217 | lists of dates, hierarchical classification structures, and so<br>
|
---|
218 | <br>
|
---|
219 | on. Data structures for both browsing and searching are<br>
|
---|
220 | But we make a distinction between a digital library and a<br>
|
---|
221 | built according to instructions in a configuration file,<br>
|
---|
222 | web site that lies at the heart of our software design: one<br>
|
---|
223 | which controls both building and serving the collection.<br>
|
---|
224 | should easily be able to add new material to a library<br>
|
---|
225 | Sample configuration files are discussed below.<br>
|
---|
226 | without having to integrate it manually or edit its content<br>in any way. Once added, new material should immediately<br>
|
---|
227 | <hr>
|
---|
228 | </Content>
|
---|
229 | </Section>
|
---|
230 | <Section>
|
---|
231 | <Description>
|
---|
232 | <Metadata name="Title">3</Metadata>
|
---|
233 | </Description>
|
---|
234 | <Content><br />
|
---|
235 | <IMG src="_httpdocimg_/pdf01-3_1.jpg"><br>
|
---|
236 | matter of specifying all the necessary plugins. In order to<br>build browsing indexes from metadata, an analogous<br>scheme of “classifiers” is used: classifiers create indexes<br>of various kinds based on metadata. Source documents are<br>brought into the Greenstone system through a process<br>called <i>importing</i>, which uses the plugins and classifiers<br>specified in the collection configuration file.<br>
|
---|
237 | <br>The international Unicode character set is used throughout,<br>so documents—and interfaces—can be written in any<br>language. Collections have so far been produced in<br>English, French, Spanish, German, Maori, Chinese, and<br>Arabic. The NZDL Web site provides numerous examples.<br>Collections can contain text, pictures, and even audio and<br>video clips; a text-only version of the interface is also<br>provided to accommodate visually impaired users.<br>Compression technology is used to ensure best use of<br>storage (Witten <i>et al </i>., 1999). Most non-textual material is<br>either linked to textual documents or accompanied by<br>textual descriptions (such as photo captions) to allow full-<br>text searching and browsing. However, the architecture<br>
|
---|
238 | <b>Figure 2: Browsing the HDL collection by subject</b><br>
|
---|
239 | permits the implementation of plugins and classifiers even<br>for non-textual data.<br>
|
---|
240 | <br>Rich browsing facilities can be provided by manually<br>
|
---|
241 | <br>
|
---|
242 | linking parts of documents together and building explicit<br>
|
---|
243 | The system includes an “administrative” function whereby<br>
|
---|
244 | indexes and tables of contents. However, manually-created<br>
|
---|
245 | specified users can examine the composition of all<br>
|
---|
246 | linking becomes difficult to maintain, and often falls into<br>
|
---|
247 | collections, protect documents so that they can only be<br>
|
---|
248 | disrepair when a collection expands. The Greenstone<br>
|
---|
249 | accessed by registered users on presentation of a password,<br>
|
---|
250 | software takes a different tack: it facilitates <i>maintainability</i><br>
|
---|
251 | and so on. Logs of user activity are kept that record all<br>
|
---|
252 | by creating all searching and browsing structures<br>
|
---|
253 | queries made to every Greenstone collection (though this<br>
|
---|
254 | automatically from the documents themselves. No links<br>
|
---|
255 | facility can be disabled).<br>
|
---|
256 | are inserted by hand. This means that when new<br>
|
---|
257 | <br>Although primarily designed for Internet access over the<br>
|
---|
258 | documents in the same format become available, they can<br>
|
---|
259 | World-Wide Web, collections can be made available, in<br>
|
---|
260 | be added automatically. Indeed, for some collections this is<br>
|
---|
261 | precisely the same form, on CD-ROM. In either case they<br>
|
---|
262 | done by processes that wake up regularly, scout for new<br>
|
---|
263 | are accessed through any Web browser. Greenstone CD-<br>
|
---|
264 | material, and rebuild the indexes—all without manual<br>
|
---|
265 | ROMs operate on a standalone PC under Windows 3.X,<br>
|
---|
266 | intervention.<br>
|
---|
267 | 95, 98, and NT, and the interaction is identical to accessing<br>
|
---|
268 | Collections comprise many documents: thousands, tens of<br>
|
---|
269 | the collection on the Web—except that response is faster<br>
|
---|
270 | thousands, or even millions. Each document may be<br>
|
---|
271 | and more predictable. The requirement to operate on early<br>
|
---|
272 | hierarchically organized into <i>sections</i> (subsections, sub-<br>
|
---|
273 | Windows systems is one that plagues the software design,<br>
|
---|
274 | subsections, and so on). Each section comprises one or<br>
|
---|
275 | but is crucial for many users—particularly those in<br>
|
---|
276 | more <i>paragraphs</i>. Metadata such as author, title, date,<br>
|
---|
277 | underdeveloped countries seeking access to humanitarian<br>
|
---|
278 | keywords, and so on, may be associated with documents,<br>
|
---|
279 | aid collections. If the PC is connected to a network<br>
|
---|
280 | or with individual sections of documents. This is the raw<br>
|
---|
281 | (intranet or Internet), a custom-built Web server provided<br>
|
---|
282 | material for indexes. It must either be provided explicitly<br>
|
---|
283 | on each CD makes exactly the same information available<br>
|
---|
284 | for each document and section (for example, in an<br>
|
---|
285 | to others through their standard Web browser. The use of<br>
|
---|
286 | accompanying spreadsheet) or be derivable automatically<br>
|
---|
287 | compression ensures that the greatest possible volume of<br>
|
---|
288 | from the source documents. Metadata is converted to<br>
|
---|
289 | information can be packed on to a CD-ROM.<br>
|
---|
290 | Dublin Core and stored with the document for internal use.<br>
|
---|
291 | <br>The collection-serving software operates under Unix and<br>
|
---|
292 | <br>In order to accommodate different kinds of source<br>
|
---|
293 | Windows NT, and works with standard Web servers. A<br>
|
---|
294 | documents, the software is organized so that “plugins” can<br>
|
---|
295 | flexible process structure allows different collections to be<br>
|
---|
296 | be written for new document types. Plugins exist for plain<br>
|
---|
297 | served by different computers, yet be presented to the user<br>
|
---|
298 | text documents, HTML documents, email documents, and<br>
|
---|
299 | in the same way, on the same Web page, as part of the<br>
|
---|
300 | bibliographic formats. Word documents are handled by<br>
|
---|
301 | same digital library, even as part of the same collection<br>
|
---|
302 | saving them as HTML; PostScript ones by applying a<br>
|
---|
303 | (McNab and Witten, 1998). Existing collections can be<br>
|
---|
304 | preprocessor (Nevill-Manning <i>et al</i>., 1998). Specially<br>
|
---|
305 | updated and new ones brought on-line at any time, without<br>
|
---|
306 | written plugins also exist for proprietary formats such as<br>
|
---|
307 | bringing the system down; the process responsible for the<br>
|
---|
308 | that used by the BBC archives department. A collection<br>
|
---|
309 | user interface will notice (through periodic polling) when<br>
|
---|
310 | may have source documents in different forms: it is just a<br>
|
---|
311 | new collections appear and add them to the list presented<br>to the user.<br>
|
---|
312 | <hr>
|
---|
313 | </Content>
|
---|
314 | </Section>
|
---|
315 | <Section>
|
---|
316 | <Description>
|
---|
317 | <Metadata name="Title">4</Metadata>
|
---|
318 | </Description>
|
---|
319 | <Content><br />
|
---|
320 | <IMG src="_httpdocimg_/pdf01-4_1.jpg"><br>
|
---|
321 | <b>FILES IN A COLLECTION</b><br>
|
---|
322 | <br>When a new collection is created or material is added to an<br>existing one, the original source documents are first<br>brought into the system through a process known as<br>“importing.” This involves converting documents into a<br>simple HTML-like format known as GML (for<br>“Greenstone Markup Language”), which includes any<br>metadata associated with the document. Documents are<br>assumed to be in the Unicode UTF-8 code (of which the<br>ASCII characters form a subset).<br>
|
---|
323 | <br><b>Files and directories</b><br>
|
---|
324 | <br>There is a separate directory for each collection, which<br>contains five subdirectories: the original raw material<br>(<i>import</i>), the GML files created from this (<i>archives</i>), the<br>final collection as it is served to users (<i>index</i>), a directory<br>for use during the building process (<i>building</i>), and one for<br>any supporting files (<i>etc</i>)—including the configuration file<br>
|
---|
325 | <b>Figure 3: Reading a book in the HDL</b><br>
|
---|
326 | that controls the collection creation procedure. Additional<br>files might be required: for example, building a hierarchy<br>of classifications requires a data file of sub-classifications.<br>
|
---|
327 | <b>FINDING INFORMATION</b><br>
|
---|
328 | <br>Greenstone digital library systems generally include<br>
|
---|
329 | <br>
|
---|
330 | several separate collections. A home page allows you to<br>
|
---|
331 | <b>The imported documents</b><br>
|
---|
332 | select a collection; in addition, each collection has its own<br>
|
---|
333 | <br>In order to identify documents internally, a unique object<br>
|
---|
334 | “about” page that gives you information about how the<br>
|
---|
335 | identifier or OID is assigned to each original source<br>
|
---|
336 | collection is organized and the principles governing what<br>
|
---|
337 | document when it is imported (formed by hashing the<br>
|
---|
338 | is included.<br>
|
---|
339 | content, to overcome file duplication effects caused by<br>
|
---|
340 | <br>All icons in the screenshots of Figures 1–4 are clickable.<br>
|
---|
341 | mirroring) and stored as metadata within that document. It<br>
|
---|
342 | Those icons at the top of the page return to the home page,<br>
|
---|
343 | is important that OIDs persist throughout the index-<br>
|
---|
344 | provide help text, and allow you to set user interface and<br>
|
---|
345 | building process—so that a user’s search history is<br>
|
---|
346 | searching preferences. The navigation bar underneath<br>
|
---|
347 | unaffected by rebuilding the collection. OIDs are assigned<br>
|
---|
348 | gives access to the searching and browsing facilities,<br>
|
---|
349 | by hashing the contents of the original source document.<br>
|
---|
350 | which differ from one collection to another.<br>
|
---|
351 | <br>Once imported, each document is stored in its own<br>
|
---|
352 | <br>Each of the five buttons provides a different way to find<br>
|
---|
353 | subdirectory of <i>archives</i>, along with any associated<br>
|
---|
354 | information. You can <i>search for particular words</i> that<br>
|
---|
355 | files—for example, images. To ensure compatibility with<br>
|
---|
356 | appear in the text from the “search” page (or from the<br>
|
---|
357 | Windows 3.0, only eight characters are used in directory<br>
|
---|
358 | “about” page of Figure 1). This collection contains indexes<br>
|
---|
359 | and file names, which causes annoying but essentially<br>
|
---|
360 | of chapters, section titles, and entire books. The default<br>
|
---|
361 | trivial complications.<br>
|
---|
362 | search interface is a simple one, suitable for casual users;<br>advanced searching—which allows full Boolean<br>
|
---|
363 | <br><b>Inside the documents</b><br>
|
---|
364 | expressions, phrase searching, case and stemming<br>control—can be enabled from the <i>Preferences</i> page.<br>
|
---|
365 | <br>The GML format imposes a limited amount of structure on<br>
|
---|
366 | <br>
|
---|
367 | documents. Documents are divided into paragraphs. They<br>
|
---|
368 | This collection has four browsable metadata indexes. You<br>
|
---|
369 | can be split hierarchically into sections and subsections.<br>
|
---|
370 | can <i>access publications by subject</i> by clicking the <i>subjects</i><br>
|
---|
371 | OIDs are extended to identify these components by<br>
|
---|
372 | button, which brings up a list of subjects, represented by<br>
|
---|
373 | appending numbers, separated by periods, to a document’s<br>
|
---|
374 | bookshelves (Figure 2). You can <i>access publications by</i><br>
|
---|
375 | OID. When a book is read, its section hierarchy is visible<br>
|
---|
376 | <i>title</i> by clicking <i>titles a-z</i> (Figure 4), which brings up a list<br>
|
---|
377 | as the table of contents (Figure 3). Chapters, sections,<br>
|
---|
378 | of books in alphabetic order. You can <i>access publications</i><br>
|
---|
379 | subsections, and pages are all implemented simply as<br>
|
---|
380 | <i>by organization</i> (i.e. Dublin Core “publisher”), bringing up<br>
|
---|
381 | “sections” within the document. In some collections<br>
|
---|
382 | a list of organizations. You can <i>access publications by</i><br>
|
---|
383 | documents do not have a hierarchical subsection structure,<br>
|
---|
384 | <i>“how to” listing</i>, yielding a list of hints defined by the<br>
|
---|
385 | but are split into pages to permit browsing within a<br>
|
---|
386 | collection’s editors. We use the Dublin Core as a base and<br>
|
---|
387 | retrieved document.<br>
|
---|
388 | extend it in an <i>ad hoc</i> manner to accommodate the<br>individual requirements of collection designers.<br>
|
---|
389 | <br>The document structure is used for searchable indexes.<br>There are three levels of index: <i>documents</i>, <i>sections</i>, and<br>
|
---|
390 | <hr>
|
---|
391 | </Content>
|
---|
392 | </Section>
|
---|
393 | <Section>
|
---|
394 | <Description>
|
---|
395 | <Metadata name="Title">5</Metadata>
|
---|
396 | </Description>
|
---|
397 | <Content><br />
|
---|
398 | <IMG src="_httpdocimg_/pdf01-5_1.jpg"><br>
|
---|
399 | the <i>import</i> process is invoked, which converts the files into<br>GML using the specified plugins. Old material for which<br>GML files have previously been created is not re-imported.<br>Then the <i>build</i> process is invoked to build the requisite<br>indexes for the collection. Finally, the contents of the<br><i>building</i> directory are moved into the <i>index</i> directory, and<br>the new version of the collection automatically becomes<br>live.<br>
|
---|
400 | <br>This procedure may seem cumbersome. But all the steps<br>are necessary for efficient operation with large collections.<br>The <i>import</i> process could be performed on the fly during<br>the building operation—but because building indexes is a<br>multipass operation, the often lengthy importing would be<br>repeated several times. The <i>build</i> process can take<br>considerable time—a day or two, for very large<br>collections. Consequently, the results are placed in the<br><i>building</i> directory so that, if the collection already exists, it<br>will continue to be served to users in its old form<br>throughout the building operation.<br>
|
---|
401 | <br>Active users of the collection will not be disturbed when<br>the new version becomes live—they will probably not<br>
|
---|
402 | <b>Figure 4: Browsing titles in the HDL</b><br>
|
---|
403 | even notice. The persistent OIDs ensure that interactions<br>remain coherent—users who are examining the results of a<br>query or browse operation will still retrieve the expected<br>
|
---|
404 | <i>paragraphs</i>, corresponding to the distinctions that GML<br>
|
---|
405 | documents—and if a search is actually in progress when<br>
|
---|
406 | makes—the hierarchical structure is flattened for the<br>
|
---|
407 | the change takes place the program detects the resulting<br>
|
---|
408 | purposes of creating these indexes. Indexes can be of text,<br>
|
---|
409 | file-structure inconsistency and automatically and<br>
|
---|
410 | or metadata, or any combination. Thus you can create a<br>
|
---|
411 | transparently re-executes the query, this time on the new<br>
|
---|
412 | searchable index of section titles, and/or authors, and/or<br>
|
---|
413 | version of the collection.<br>
|
---|
414 | document descriptions, as well as the document text.<br>
|
---|
415 | <b>UPDATING EXISTING COLLECTIONS</b><br>
|
---|
416 | <br><b>How it works</b><br>
|
---|
417 | <br>Updating an existing collection with new files in the same<br>
|
---|
418 | <br>The original material in the <i>import</i> directory may be in any<br>
|
---|
419 | format is easy. For example, the raw material for the HDL<br>
|
---|
420 | format, and plugins are required to process each format<br>
|
---|
421 | is supplied in the form of HTML files marked up with<br>
|
---|
422 | type. The plugins that a collection uses must be specified<br>
|
---|
423 | &lt;&lt;TOC&gt;&gt; tags to split books into sections and<br>
|
---|
424 | in the collection configuration file. The <i>import</i> program<br>
|
---|
425 | subsections, and &lt;&lt;I&gt;&gt; tags to indicate where an image is<br>
|
---|
426 | reads the list of plugins and passes each document to each<br>
|
---|
427 | to be inserted. For each book in the library there is a<br>
|
---|
428 | plugin in order until it finds one that can process it. When<br>
|
---|
429 | directory that contains a single HTML file representing the<br>
|
---|
430 | updating an existing collection, all plugins necessary to<br>
|
---|
431 | book, and separate files containing the associated images.<br>
|
---|
432 | process new material should already have been specified in<br>
|
---|
433 | An accompanying spreadsheet file contains the<br>
|
---|
434 | the configuration file.<br>
|
---|
435 | classification hierarchy; this is converted to a simple file<br>format (using Excel’s <i>Save As</i> command).<br>
|
---|
436 | <br>The building step creates the indexes for both searching<br>and browsing. The MG software is generally used to do the<br>
|
---|
437 | <br>Since the collection exists, its directory is already set up<br>
|
---|
438 | searching (Witten <i>et al.</i>, 1999), and the <i>mgbuild</i> module is<br>
|
---|
439 | with subdirectories <i>import</i>, <i>archives</i>, <i>building</i>, <i>index</i>, and<br>
|
---|
440 | automatically invoked to create each of the indexes that is<br>
|
---|
441 | <i>etc</i>, and the <i>etc</i> directory will contain a suitable collection<br>
|
---|
442 | required. For example, the Humanity Development Library<br>
|
---|
443 | configuration file.<br>
|
---|
444 | has three indexes, one for entire books, one for chapters,<br>and one for section titles. Subdirectories of the <i>index</i><br>
|
---|
445 | <br>
|
---|
446 | directory are created for each of these indexes.<br>
|
---|
447 | <b>The updating procedure</b><br>
|
---|
448 | <br>To update a collection, the new raw material is placed in<br>the <i>import</i> directory, in whatever form it is available. Then<br>
|
---|
449 | <hr>
|
---|
450 | </Content>
|
---|
451 | </Section>
|
---|
452 | <Section>
|
---|
453 | <Description>
|
---|
454 | <Metadata name="Title">6</Metadata>
|
---|
455 | </Description>
|
---|
456 | <Content><br />
|
---|
457 | creator<br>
|
---|
458 | [email protected]<br>
|
---|
459 | 1<br>
|
---|
460 | maintainer<br>
|
---|
461 | [email protected]<br>
|
---|
462 | 2<br>
|
---|
463 | public<br>
|
---|
464 | True<br>
|
---|
465 | 3<br>4<br>
|
---|
466 | indexes<br>
|
---|
467 | document:text<br>
|
---|
468 | 5<br>
|
---|
469 | defaultindex<br>
|
---|
470 | document:text<br>
|
---|
471 | 6<br>
|
---|
472 | plugins<br>
|
---|
473 | GMLPlug TEXTPlug ArcPlug RecPlug<br>
|
---|
474 | 7<br>8<br>
|
---|
475 | classify<br>
|
---|
476 | AZList metadata=Title<br>
|
---|
477 | 9<br>10<br>
|
---|
478 | collectionmeta<br>
|
---|
479 | collectionname &quot;generic text collection&quot;<br>
|
---|
480 | 11<br>
|
---|
481 | (a)<br>
|
---|
482 | collectionmeta<br>
|
---|
483 | .document:text &quot;documents&quot;<br>
|
---|
484 | 12<br>
|
---|
485 | creator<br>
|
---|
486 | [email protected]<br>
|
---|
487 | 1<br>
|
---|
488 | maintainer<br>
|
---|
489 | [email protected]<br>
|
---|
490 | 2<br>
|
---|
491 | public<br>
|
---|
492 | True<br>
|
---|
493 | 3<br>4<br>
|
---|
494 | indexes<br>
|
---|
495 | document:text document:From<br>
|
---|
496 | 5<br>
|
---|
497 | defaultindex<br>
|
---|
498 | document:text<br>
|
---|
499 | 6<br>
|
---|
500 | plugins<br>
|
---|
501 | GMLPlug EMAILPlug ArcPlug RecPlug<br>
|
---|
502 | 7<br>8<br>
|
---|
503 | classify<br>
|
---|
504 | AZList metadata=Title<br>
|
---|
505 | 9<br>
|
---|
506 | classify<br>
|
---|
507 | DateList<br>
|
---|
508 | 10<br>11<br>
|
---|
509 | collectionmeta<br>
|
---|
510 | collectionname &quot;Email messages&quot;<br>
|
---|
511 | 12<br>
|
---|
512 | collectionmeta<br>
|
---|
513 | .document:text &quot;documents&quot;<br>
|
---|
514 | 13<br>
|
---|
515 | collectionmeta<br>
|
---|
516 | .document:From &quot;email senders&quot;<br>
|
---|
517 | 14<br>15<br>
|
---|
518 | format<br>
|
---|
519 | QueryResults \\\\<br>
|
---|
520 | 16<br>
|
---|
521 | (b)<br>
|
---|
522 | &lt;td&gt;[link][icon][/link]&lt;/td&gt;&lt;td&gt;[Title]&lt;/td&gt;&lt;td&gt;[Author]&lt;/td&gt;<br>
|
---|
523 | 17<br>
|
---|
524 | <b>Figure 5: Collection configuration files (a) generic, (b) for an email collection</b><br>
|
---|
525 | <br>MG also compresses the text of the collection; and the<br>
|
---|
526 | certain circumstances, however, it might be preferable to<br>
|
---|
527 | image files are linked into the <i>index</i> subdirectory. Now<br>
|
---|
528 | use a standardized format such as XML. This is<br>
|
---|
529 | none of the material in the <i>import</i> and <i>archives</i> directories<br>
|
---|
530 | straightforward to implement—just write an XML<br>
|
---|
531 | is needed to run the collection and can be removed from<br>
|
---|
532 | pluginalthough we have not done so ourselves. Given<br>
|
---|
533 | the file system (though they would be needed if the<br>
|
---|
534 | the transitory nature of the imported data, to date, we have<br>
|
---|
535 | collection were rebuilt).<br>
|
---|
536 | found GML a satisfactory and beneficial format.<br>
|
---|
537 | <br>Associated with each collection is a database stored in<br>
|
---|
538 | <b>CREATING NEW COLLECTIONS</b><br>
|
---|
539 | GDBM (Gnu database manager) format. This contains an<br>entry for each document, giving its OID, its internal MG<br>
|
---|
540 | <br>Building new collections from scratch is only slightly<br>
|
---|
541 | document number, and metadata such as title. Information<br>
|
---|
542 | different from updating an existing collection. The key<br>
|
---|
543 | for each of the browsing indexes, which appear as buttons<br>
|
---|
544 | new requirement is creating a collection configuration file,<br>
|
---|
545 | on the Greenstone search/browse bar, is also extracted<br>
|
---|
546 | and a software utility is provided to help. Two pieces of<br>
|
---|
547 | during the building process and stored in the database. A<br>
|
---|
548 | information are required for this: the name of the directory<br>
|
---|
549 | “classifier” program is required for each browsing index to<br>
|
---|
550 | that the collection will use (into which the source data and<br>
|
---|
551 | extract the appropriate information from GML documents.<br>
|
---|
552 | other files will eventually be placed), and a contact e-mail<br>
|
---|
553 | Like plugins, classifiers are written on an <i>ad hoc</i> basis for<br>
|
---|
554 | address for use if any problems are encountered by the<br>
|
---|
555 | the particular information required, and where possible<br>
|
---|
556 | software once the collection is up and running. The utility<br>
|
---|
557 | reused from one collection to another.<br>
|
---|
558 | creates files and directories within the newly-named<br>
|
---|
559 | <br>
|
---|
560 | directory to support a generic collection of plain text<br>
|
---|
561 | The building program creates the indexes based on<br>
|
---|
562 | documents. With suitable data placed in the <i>import</i><br>
|
---|
563 | whatever appears in the <i>archives</i> directory. The first plugin<br>
|
---|
564 | directory, building the collection at this point will yield a<br>
|
---|
565 | specified by all collections is one that processes GML<br>
|
---|
566 | document-level searchable index of all the text and a<br>
|
---|
567 | files, and so if <i>archives</i> contains imported files they will be<br>
|
---|
568 | browsable list of “titles” (defined in this case to be the<br>
|
---|
569 | processed correctly. If it contains material in the original<br>
|
---|
570 | document filenames).<br>
|
---|
571 | format, that will be converted using the appropriate plugin.<br>Thus the import process is optional.<br>
|
---|
572 | <br>To enhance the functionality and presentation—something<br>
|
---|
573 | <br>
|
---|
574 | anything but the most trivial collection will require—the<br>
|
---|
575 | GML is designed to be fast and easy to parse, an important<br>
|
---|
576 | configuration file must be edited. For a collection sourced<br>
|
---|
577 | requirement when millions of documents are to be<br>
|
---|
578 | from documents in an already supported data format,<br>
|
---|
579 | processed. Something as simple as requiring tags to be<br>
|
---|
580 | presented in a similar fashion to an existing collection, the<br>
|
---|
581 | lower-case, for example, yields a substantial speed-up. In<br>
|
---|
582 | <hr>
|
---|
583 | </Content>
|
---|
584 | </Section>
|
---|
585 | <Section>
|
---|
586 | <Description>
|
---|
587 | <Metadata name="Title">7</Metadata>
|
---|
588 | </Description>
|
---|
589 | <Content><br />
|
---|
590 | <IMG src="_httpdocimg_/pdf01-7_1.jpg"><br>
|
---|
591 | <br>These are modules of code that can be slotted into the<br>system to enhance its capabilities. Plugins parse<br>documents, extracting the text and metadata to be indexed.<br>Classifiers control how metadata is brought together to<br>form browsable data structures. Both are specified in an<br>object-oriented framework using inheritance to minimize<br>the amount of code written.<br>
|
---|
592 | <br>A plugin must specify three things: what file formats it can<br>handle, how they should be parsed, and whether the plugin<br>is recursive. File formats are normally determined using<br>regular expression matching on the filename. For example,<br>the HTML plugin accepts all files that end in <i>.htm</i>, <i>.html</i>,<br><i>.HTM</i>, or <i>.HTML</i>. (It is quite possible, however, to write<br>plugins that “look inside” the file as well.) For other files,<br>the plugin returns <i>undefined</i> and the file is passed to the<br>next plugin in the collection’s configuration file (e.g.<br>Figure 5 line 7). If it can, the plugin parses the file and<br>returns the number of documents processed. This involves<br>extracting text and metadata and adding it to the library’s<br>content through calls to <i>add text</i> and <i>add metadata</i>.<br>
|
---|
593 | <br>Some plugins (“recursive” ones) add extra files into the<br>
|
---|
594 | <b>Figure 6: Searching bookmarked Web pages</b><br>
|
---|
595 | stream of data processed during the building phase by<br>artificially reactivating the list of plugins. This is how<br>directory hierarchies are traversed.<br>
|
---|
596 | amount of editing is minimal. Importing new data formats<br>and browsing metadata in ways not currently supported are<br>
|
---|
597 | <br>Plugins are small modules of code that are easy to write.<br>
|
---|
598 | more complex activities that require programming skills.<br>
|
---|
599 | We monitored the time it took to develop a new one that<br>was different to any we had produced so far. We chose to<br>make as an example a collection of HTML bookmark files,<br>
|
---|
600 | <br><b>Modifying the configuration file</b><br>
|
---|
601 | the motivation being to produce a convenient way of<br>
|
---|
602 | <br>
|
---|
603 | searching and browsing one’s bookmarked Web pages.<br>
|
---|
604 | Figure 5b shows simple alterations to the generic<br>
|
---|
605 | Figure 6 shows a user searching for bookmarked pages<br>
|
---|
606 | configuration file in Figure 5a that was generated by the<br>
|
---|
607 | about <i>music</i>. The new plugin took under an hour to write,<br>
|
---|
608 | new-collection utility. <i>TEXTPlug</i> is replaced with<br>
|
---|
609 | and was 160 lines long (ignoring blank lines and<br>
|
---|
610 | <i>EMAILPlug</i> (line 7) which reads email files and extracts<br>
|
---|
611 | comments)—about the average length of existing plugins.<br>
|
---|
612 | metadata (<i>From</i>, <i>To</i>, <i>Date</i>, <i>Subject</i>) from them. A classifier<br>for dates is added (line 10) to make the collection<br>
|
---|
613 | <br>Classifiers are more general than plugins because they<br>
|
---|
614 | browsable chronologically. The default presentation of<br>
|
---|
615 | work on GML-format data. For example, any plugin that<br>
|
---|
616 | search results is overridden (line 17) to display both the<br>
|
---|
617 | generates date metadata in accordance with the Dublin<br>
|
---|
618 | title of the message (i.e. Dublin Core <i>Title</i>) and its sender<br>
|
---|
619 | core can request the collection to be browsable<br>
|
---|
620 | (i.e. Dublin Core <i>Author</i>). Elements in square brackets,<br>
|
---|
621 | chronologically by specifying the <i>DateList</i> classifier in the<br>
|
---|
622 | such as <i>[Title]</i>, are replaced by the metadata associated<br>
|
---|
623 | collection’s configuration file (Figure 7). Classifiers are<br>
|
---|
624 | with a particular document. The built-in term <i>[icon]</i><br>
|
---|
625 | more elaborate than most plugins, but new ones are seldom<br>
|
---|
626 | produces a suitable image that represents the document<br>
|
---|
627 | required. The average length of existing classifiers is 230<br>
|
---|
628 | (such as a book icon or page icon), and the <i>[link]…[/link]</i><br>
|
---|
629 | lines.<br>
|
---|
630 | construct forms a hyperlink to the complete document.<br>
|
---|
631 | <br>
|
---|
632 | Anything else in the format statement, which in this case is<br>
|
---|
633 | Classifiers must specify three things: an initialization<br>
|
---|
634 | solely table-cell tags in HTML, is passed through to the<br>
|
---|
635 | routine, how individual documents are classified, and the<br>
|
---|
636 | page being displayed.<br>
|
---|
637 | final browsable data structure. Initialization takes care of<br>any options specified in the configuration file (such as<br>
|
---|
638 | As this example shows, creating a new collection that stays<br>
|
---|
639 | <i>metadata=Title </i>on line 9 of Figure 5b). Classifying<br>
|
---|
640 | within the bounds of the library’s established capabilities<br>
|
---|
641 | individual documents is an iterative process: for each one,<br>
|
---|
642 | falls within the capability of many computer users—for<br>
|
---|
643 | a call to <i>document-classify</i> is made. On presentation of the<br>
|
---|
644 | instance, computer-trained librarians. Extending<br>
|
---|
645 | document’s OID, the necessary metadata is located and<br>
|
---|
646 | Greenstone to handle new document formats and browse<br>
|
---|
647 | used to control where the document is added to the<br>
|
---|
648 | metadata in new ways is more challenging.<br>
|
---|
649 | browsable data structure being constructed.<br>
|
---|
650 | <br>Once all documents have been added, a request is made for<br>
|
---|
651 | <br><b>Writing new plugins and classifiers</b><br>
|
---|
652 | the completed data structure. Some classifiers return the<br>data structure directly; others transform the data structure<br>
|
---|
653 | <br>Extensibility is obtained through plugins and classifiers.<br>
|
---|
654 | before it is returned. For example, the <i>AZList</i> classifier<br>
|
---|
655 | <hr>
|
---|
656 | </Content>
|
---|
657 | </Section>
|
---|
658 | <Section>
|
---|
659 | <Description>
|
---|
660 | <Metadata name="Title">8</Metadata>
|
---|
661 | </Description>
|
---|
662 | <Content><br />
|
---|
663 | <IMG src="_httpdocimg_/pdf01-8_1.jpg"><br>
|
---|
664 | a page number, next and previous page buttons, and<br>displaying a particular page at different resolutions. A text<br>version of the page is also available upon which a<br>searching option is also provided.<br>
|
---|
665 | Started in 1994, Harvest is also a long-running research<br>project. It provides an efficient means of gathering source<br>data from the Internet and distributing indexing<br>information over the Internet. This is accomplished<br>through five components: <i>gatherer</i>, <i>broker</i>, <i>indexer</i>,<br><i>replicator</i> and <i>cache</i>. The first three are central to creating,<br>updating and searching a collection; the last two help to<br>improve performance over the Internet through transparent<br>mirroring and caching techniques.<br>
|
---|
666 | The system is configurable and customizable. While<br>searching is most commonly implemented using Glimpse<br>(<i>glimpse.cs.arizona.edu</i>), in principle any search engine<br>that supports incremental updates and Boolean<br>combinations of attribute-based queries can be used. It is<br>possible to control what type of documents are gathered<br>during creation and updating, and how the query interface<br>
|
---|
667 | <b>Figure 7: Browsing a newspaper collection by date</b><br>
|
---|
668 | looks and is laid out.<br>
|
---|
669 | Sample collections cited by the developers include 21,000<br>
|
---|
670 | divides the alphabetically sorted list of metadata into<br>
|
---|
671 | computer science technical reports and 7,000 home pages.<br>
|
---|
672 | separate pages of about the same size and returns the<br>
|
---|
673 | Other examples include a sizable collection of agriculture-<br>
|
---|
674 | alphabetic ranges for each one (Figure 4).<br>
|
---|
675 | related electronic journals and magazines called “tomato-<br>juice” (accessed through <i>hegel.lib.ncsu.edu</i>) and a full-text<br>
|
---|
676 | <b>OVERVIEW OF RELATED WORK</b><br>
|
---|
677 | index of library-related electronic serials<br>
|
---|
678 | Two projects that provide substantial open source digital<br>
|
---|
679 | (<i>sunsite.berkeley.edu/IndexMorganagus</i>). Harvest is also<br>
|
---|
680 | library software are Dienst (Lagoze and Fielding, 1998)<br>
|
---|
681 | often used to index Web sites (for example<br>
|
---|
682 | and Harvest (Bowman <i>et al.</i>, 1994). The origins of Dienst<br>
|
---|
683 | <i>www.middlebury.edu</i>).<br>
|
---|
684 | (<i>www.cs.cornell.edu/cdlrg</i>) stretch back to 1992. The term<br>
|
---|
685 | Comparing Greenstone with Dienst and Harvest, there are<br>
|
---|
686 | has come to represent three entities: a conceptual<br>
|
---|
687 | both similarities and differences. All provide substantial<br>
|
---|
688 | architecture for distributed digital libraries; an open<br>
|
---|
689 | digital library systems, hence common themes recur, but<br>
|
---|
690 | protocol for service communication; and a software<br>
|
---|
691 | they are driven by projects with different aims. Harvest,<br>
|
---|
692 | system that implements the protocol. To date, five sample<br>
|
---|
693 | for instance, was not conceived as a digital library project<br>
|
---|
694 | digital libraries have been built using this technology.<br>
|
---|
695 | at all, but by virtue of its selective document gathering<br>
|
---|
696 | They manifest themselves in two forms: technical reports<br>
|
---|
697 | process it can be classed (and is used) as one. While it<br>
|
---|
698 | and primary source documents.<br>
|
---|
699 | provides sophisticated search options, it lacks the<br>
|
---|
700 | Best known is NCSTRL, the Networked Computer<br>
|
---|
701 | complementary service of browsing. Furthermore it adds<br>
|
---|
702 | Science Technical Reference Library project<br>
|
---|
703 | no structure or order to the documents collected, relying<br>
|
---|
704 | (<i>www.ncstrl.org</i>). This collection facilitates searching by<br>
|
---|
705 | on whatever structures are present in the site that they<br>
|
---|
706 | title, author and abstract, and browsing by year and author,<br>
|
---|
707 | were gathered from. A proven strength of the design is its<br>
|
---|
708 | across a distributed network of document repositories.<br>
|
---|
709 | flexibility through configuration and customization—an<br>
|
---|
710 | Documents can (where supported) be delivered in various<br>
|
---|
711 | element also present in Greenstone.<br>
|
---|
712 | formats such as PostScript, a thumbnail overview of the<br>
|
---|
713 | Dienstbest exemplified through the NCSTRL<br>
|
---|
714 | pages, and a GIF image of a particular page.<br>
|
---|
715 | worksupports searching and browsing, like Greenstone.<br>
|
---|
716 | The <i>Making of America</i> resource is an example of a<br>
|
---|
717 | Both use open protocols. Differences include a high<br>
|
---|
718 | collection based around primary sources—in this case<br>
|
---|
719 | reliance in Dienst on user-supplied information when a<br>
|
---|
720 | American social history, 1830–1900. It has a different<br>
|
---|
721 | document is added, and a smaller range of document types<br>
|
---|
722 | “look and feel” to NCSTRL, being strongly oriented<br>
|
---|
723 | supported—although Dienst does include a document<br>
|
---|
724 | toward browsing rather than searching. A user navigates<br>
|
---|
725 | model that should, over time, allow this to expand with<br>
|
---|
726 | their way through a hierarchical structure of hyperlinks to<br>
|
---|
727 | relative ease.<br>
|
---|
728 | reach a book of interest. The book itself is a series of<br>
|
---|
729 | There are also commercial systems that provide similar<br>
|
---|
730 | scanned images: delivery options include going directly to<br>
|
---|
731 | digital library services to those described. However, since<br>
|
---|
732 | <hr>
|
---|
733 | </Content>
|
---|
734 | </Section>
|
---|
735 | <Section>
|
---|
736 | <Description>
|
---|
737 | <Metadata name="Title">9</Metadata>
|
---|
738 | </Description>
|
---|
739 | <Content><br />
|
---|
740 | corporate culture instills proprietary attitudes there is little<br>
|
---|
741 | <b>REFERENCES</b><br>
|
---|
742 | opportunity for advancement through a shared<br>
|
---|
743 | 1. Akscyn, R.M. and Witten, I.H. (1998) “Report on First<br>
|
---|
744 | collaborative effort. Consequently they are not reviewed<br>
|
---|
745 | Summit on International Cooperation on Digital<br>
|
---|
746 | here.<br>
|
---|
747 | Libraries.” ks.com/idla-wp-oct98.<br>
|
---|
748 | 2. Bowman, C.M., Danzig, P.B., Manber, U., and<br>
|
---|
749 | <b>CONCLUSIONS</b><br>
|
---|
750 | Schwartz, M.F. “Scalable Internet resource discovery:<br>
|
---|
751 | Greenstone is a comprehensive software system for<br>
|
---|
752 | Research problems and approaches” <i>Communications</i><br>
|
---|
753 | creating digital library collections. It builds data structures<br>
|
---|
754 | <i>of the ACM,</i> Vol. 37, No. 8, pp. 98–107, 1994.<br>
|
---|
755 | for searching and browsing from the material provided,<br>
|
---|
756 | 3. Fox, E. (1998) “Digital library definitions.”<br>
|
---|
757 | rather than relying on any hand-crafting. The process is<br>
|
---|
758 | ei.cs.vt.edu/~fox/dlib/def.html.<br>
|
---|
759 | controlled by a configuration file, and once a collection<br>exists new material can be added completely<br>
|
---|
760 | 4. Humanity Libraries (1998) <i>Humanity Development</i><br>
|
---|
761 | automatically. Browsing is based on Dublin Core<br>
|
---|
762 | <i>Library</i>. CD-ROM produced by the Global Help<br>
|
---|
763 | metadata.<br>
|
---|
764 | Project, Antwerp, Belgium.<br>
|
---|
765 | New collections can be developed easily, particularly if<br>
|
---|
766 | 5. Lagoze, C. and Fielding, D “Defining Collections in<br>
|
---|
767 | they resemble existing ones. Extensibility is achieved<br>
|
---|
768 | Distributed Digital Libraries” <i>D-Lib Magazine</i>, Nov.<br>
|
---|
769 | through software “plugins” that can be written to<br>
|
---|
770 | 1998.<br>
|
---|
771 | accommodate documents, and metadata, in different<br>
|
---|
772 | 6. PAHO (1999) <i>Virtual Disaster Library</i>. CD-ROM<br>
|
---|
773 | formats. Standard plugins exist for many document types;<br>
|
---|
774 | produced by the Pan-American Health Organization,<br>
|
---|
775 | new ones are easily written. Browsing is controlled by<br>
|
---|
776 | Washington DC, USA.<br>
|
---|
777 | “classifiers” that process metadata into browsing structures<br>
|
---|
778 | 7. McNab, R.J., Witten, I.H. and Boddie, S.J. (1998) “A<br>
|
---|
779 | (by date, alphabetical, hierarchical, etc).<br>
|
---|
780 | distributed digital library architecture incorporating<br>
|
---|
781 | However, the most powerful support for extensibility is<br>
|
---|
782 | different index styles.” <i>Proc IEEE Advances in Digital</i><br>
|
---|
783 | achieved not by technical means but by making the source<br>
|
---|
784 | <i>Libraries</i>, Santa Barbara, CA, pp. 36–45.<br>
|
---|
785 | code freely available under the Gnu public license. Only<br>
|
---|
786 | 8. Nevill-Manning, C.G., Reed, T., and Witten, I.H.<br>
|
---|
787 | through an international cooperative effort will digital<br>
|
---|
788 | (1998) “Extracting text from PostScript”<br>
|
---|
789 | library software become sufficiently comprehensive to<br>
|
---|
790 | <i>Software—Practice and Experience</i>, Vol. 28, No. 5, pp.<br>
|
---|
791 | meet the world’s needs with the richness and flexibility<br>
|
---|
792 | 481–491; April.<br>
|
---|
793 | that users deserve.<br>
|
---|
794 | 9. UNESCO (1999) <i>SAHEL point DOC: Anthologie du</i><br>
|
---|
795 | <b>ACKNOWLEDGMENTS</b><br>
|
---|
796 | <i>développement au Sahel</i>. CD-ROM produced by<br>UNESCO, Paris, France.<br>
|
---|
797 | We gratefully acknowledge all those who have worked on<br>the Greenstone software, and all members of the New<br>
|
---|
798 | 10. UNU (1998) <i>Collection on critical global issues.</i> CD-<br>
|
---|
799 | Zealand Digital Library project for their enthusiasm and<br>
|
---|
800 | ROM produced by the United Nations University<br>
|
---|
801 | ideas.<br>
|
---|
802 | Press, Tokyo, Japan.<br>
|
---|
803 | 11. Witten, I.H., Moffat, A. and Bell, T. (1999) <i>Managing</i><br>
|
---|
804 | <i>Gigabytes: compressing and indexing documents and<br>images</i>, Morgan Kaufmann, second edition.<br>
|
---|
805 | <hr>
|
---|
806 |
|
---|
807 |
|
---|
808 | </Content>
|
---|
809 | </Section>
|
---|
810 | </Section>
|
---|
811 | </Archive>
|
---|