# Note: these pages are not translated package Global _textimagefaq_ {FAQ} ## "faq" ## nav_bar_button ## tfaq ## _httpicontfaqof_ {_httpimg_/tfaqof.gif} _httpicontfaqon_ {_httpimg_/tfaqon.gif} package faq _faqlink_ {
  • _3_
  • } _faqdef_ {

    _2_
    _3_

    } _faqmainlink_ {

    _t207_

    } _httppagefaqgen_ {_httppagex_(faqgen)} _faqgenlinks_ { _faqlink_(_1_,1,_t171_) _faqlink_(_1_,2,_t172_) _faqlink_(_1_,3,_t173_) _faqlink_(_1_,4,_t174_) _faqlink_(_1_,5,_t175_) _faqlink_(_1_,6,_t176_) } _httppagefaqob_ {_httppagex_(faqob)} _faqoblinks_ { _faqlink_(_1_,7,_t178_) _faqlink_(_1_,8,_t179_) _faqlink_(_1_,9,_t180_) _faqlink_(_1_,10,_t181_) } _httppagefaqinst_ {_httppagex_(faqinst)} _faqinstlinks_ { _faqlink_(_1_,11,_t183_) _faqlink_(_1_,12,_t184_) _faqlink_(_1_,gliapplet,_tfaqgliapplettitle_) } _httppagefaqrun_ {_httppagex_(faqrun)} _faqrunlinks_ { _faqlink_(_1_,13,_t186_) _faqlink_(_1_,14,_t187_) _faqlink_(_1_,15,_t188_) _faqlink_(_1_,16,_t189_) _faqlink_(_1_,17,_t190_) _faqlink_(_1_,18,_t191_) _faqlink_(_1_,19,_t192_) _faqlink_(_1_,20,_t193_) _faqlink_(_1_,21,_t194_) } _httppagefaqbuild_ {_httppagex_(faqbuild)} _faqbuildlinks_ { _faqlink_(_1_,gli,_tfaqbuildglititle_) _faqlink_(_1_,22,_t196_) _faqlink_(_1_,23,_t197_) _faqlink_(_1_,24,_t198_) _faqlink_(_1_,25,_t199_) _faqlink_(_1_,26,_t200_) _faqlink_(_1_,27,_t201_) _faqlink_(_1_,28,_t202_) _faqlink_(_1_,29,_t203_)
  • What options are available for the collect.cfg file?
  • _faqlink_(_1_,30,_t204_) _faqlink_(_1_,31,_t205_) _faqlink_(_1_,32,_tfaqbuild11title_) _faqlink_(_1_,33, _tfaqbuildexpattitle_) _faqlink_(_1_,sizelimit,_tfaqbuildsizelimittitle_) } _httppagefaqplugins_ {_httppagex_(faqplugins)} _faqpluginlinks_ { _faqlink_(_1_,plugins0,_tfaqplugins0title_) _faqlink_(_1_,plugins1,_tfaqplugins1title_) _faqlink_(_1_,plugins2,_tfaqplugins2title_) _faqlink_(_1_,plugins3,_tfaqplugins3title_) } _httppagefaqcustomize_ {_httppagex_(faqcustomize)} _faqcustomizelinks_ { _faqlink_(_1_,customizefrontpagelogo,_tfaqcustomizefrontpagelogotitle_) _faqlink_(_1_,customizemoreinfo,_tfaqcustomizemoreinfotitle_) _faqlink_(_1_,customizeformat, _tfaqcustomizeformattitle_) _faqlink_(_1_,customizemetadatalinking, _tfaqcustomizemetadatalinkingtitle_) _faqlink_(_1_,customizenewpage, _tfaqcustomizenewpagetitle_) _faqlink_(_1_,customizenotext, _tfaqcustomizenotexttitle_) } _content_ { _pageheading_(_faq:greenstonefaq_)

    _t372_

    } _content_ [faqfull=1] { _pageheading_(_faq:greenstonefaq_)

    _t373_

    _faqgen:faqdefs_

     

    _faqob:faqdefs_

     

    _faqinst:faqdefs_

     

    _faqrun:faqdefs_

     

    _faqbuild:faqdefs_

     

    _faqplugins:faqdefs_

     

    _faqcustomize:faqdefs_ } _greenstonefaq_ {Greenstone FAQ} _headinggeneral_ {General Information} _t171_ {What is Greenstone?} _t172_ {How is Greenstone licensed?} _t173_ {What platforms will Greenstone run on?} _t174_ {Are there any mailing lists concerned with Greenstone?} _t175_ {Are the mailing lists archived anywhere?} _t176_ {How do I contribute to Greenstone?} _headingobtaining_ {Obtaining Greenstone} _t178_ {Where do I get Greenstone from?} _t179_ {Are there binary distributions of Greenstone available?} _t180_ {Is Greenstone available on CD-ROM?} _t181_ {Is the Greenstone source code available via CVS?} _headinginstalling_ {Installing Greenstone} _t183_ {How do I compile Greenstone from a source or CVS distribution?} _t184_ {What is the difference between Greenstone's local library and web library?} _tfaqgliapplettitle_ {How do I install the Greenstone Librarian Interface as an applet?} _headingrunning_ {Running Greenstone} _t186_ {OK, I've installed Greenstone. Now how do I make it go?} _t187_ {What web browser do I need to view Greenstone collections?} _t188_ {When I start the Windows local library there are two buttons in the dialog box, "Enter Library" and "Restricted Version". They both seem to do the same thing, what's the difference?} _t189_ {So when should I use the "Restricted Version" of the local library?} _t190_ {When I start the Windows local library my computer asks me to dial up my Internet Service Provider. Do I really need to be online to run Greenstone?} _t191_ {I'm trying to use the Windows local library. My web browser is starting up as expected but the Greenstone home page never gets loaded or gives an error message. What's wrong?} _t192_ {Where can I get more Greenstone collections?} _t193_ {When I attempt to access certain parts of Greenstone I'm asked for a username and password. 
What do I enter?} _t194_ {When I use the large query box function I occasionally get a Not Found error.} _headingbuilding_ {Building Greenstone Collections} _tfaqbuildglititle_ {What is the "Greenstone Librarian Interface"?} _t196_ {What is "the Collector"?} _t197_ {How do I build a collection from the command line or DOS prompt?} _t198_ {I built a new Greenstone collection on my Windows machine. Everything appeared to work fine while building, however when I tried to view the collection some of the documents contained no text. Sometimes Greenstone appeared to crash completely. What have I done wrong?} _t199_ {Why won't the Collector's "export to CD-ROM" function work?} _t200_ {I'm trying to use the Collector on Windows 2000 but it's running extremely slowly. Is this normal?} _t201_ {What is "the Organizer"?} _t202_ {Where do I get the Organizer?} _t203_ {I'm attempting to build a collection with the collector but it keeps failing with an error. What am I doing wrong?} _t204_ {Where can I find some example collect.cfg configuration files?} _t205_ {How can I build my collection using MGPP?} _tfaqbuild11title_ {I've added a new type of classification to my collection. How do I create and add the navigation bar images?} _tfaqbuildexpattitle_ {How do I fix XML::Parser errors during import.pl?} _tfaqbuildsizelimittitle_ {Are there any limits to the size of collections?} _headingplugins_ {More About Plugins} _tfaqplugins0title_ {Does Greenstone have a plugin for my data format?} _tfaqplugins1title_ {What metadata is available for each plugin?} _tfaqplugins2title_ {I'm having problems with my PDF files! 
What's wrong?} _tfaqplugins3title_ {How do I use UnknownPlug to handle my new format?} _headingcustomize_ {Customizing Your Greenstone Library} _tfaqcustomizefrontpagelogotitle_ {How do I change the logo on the front page of my library ("greenstone digital library software")?} _tfaqcustomizemoreinfotitle_ {Where can I get more information about customizing my Greenstone library?} _tfaqcustomizeformattitle_ {What are the formatting options available for my collection?} _tfaqcustomizemetadatalinkingtitle_ {How can I hyperlink individual metadata elements?} _tfaqcustomizenewpagetitle_ {How can I add a new page to my Greenstone library?} _tfaqcustomizenotexttitle_ {How can I hide the dummy text "This document has no text"?} _t207_ {FAQ Main Page} _t372_ {Show entire FAQ on a single page} _t373_ {Show FAQ on multiple pages} ####################################################################### package faqgen _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headinggeneral_) _faq:faqmainlink_ _faqdefs_ } _faqdefs_ { _faq:faqdef_(1,_t171_,_faqgen:t208_) _faq:faqdef_(2,_t172_,_faqgen:t209_) _faq:faqdef_(3,_t173_,_faqgen:t210_) _faq:faqdef_(4,_t174_,_faqgen:t211_) _faq:faqdef_(5,_t175_,_faqgen:t212_) _faq:faqdef_(6,_t176_,_faqgen:t213_) } _t208_ { Greenstone is a suite of software which has the ability to serve digital library collections and build new collections. It provides a new way of organizing information and publishing it on the Internet or on CD-ROM. For more information, read the Greenstone Fact Sheet.} _t209_ { Greenstone is open-source software, distributed under the terms of the GNU General Public License. } _t210_ { Greenstone has been tested on Windows 3.1/3.11/95/98/Me/NT/2000, most distributions of GNU/Linux, Darwin (Mac OS X), Solaris, and FreeBSD. It should in fact work on any Windows or Unix system. If you use a system other than those mentioned and you find Greenstone doesn't run, please contact us.

    Please note that the downloadable Windows distribution of Greenstone comes with an installer that will not work on 16 bit Windows. If you need to use Greenstone on Windows 3.1/3.11 please contact us.

    } _t211_ { There are two Greenstone mailing lists. You can subscribe to them from the documentation page. } _t212_ { The most popular mailing list (greenstone-users@list.scms.waikato.ac.nz) is archived as a Greenstone collection at www.nzdl.org. Note that this collection is updated only sporadically so may not always be completely up to date. } _t213_ { We welcome contributions or improvements to the Greenstone software!
    Before you send in any contribution, you first need to make sure that your changes are compatible with the latest snapshot of the Greenstone source code. To get the latest code you'll need to use CVS (see here for details).
    You should then send the modified files, along with details of the modifications you've made, to greenstone@cs.waikato.ac.nz.

    Before beginning work, you should announce what you're doing on the greenstone developer's list to tell us what you plan to do and get some feedback.

    } ####################################################################### package faqob _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headingobtaining_) _faq:faqmainlink_ _faqdefs_ } _faqdefs_ { _faq:faqdef_(7,_t178_,_faqob:t215_) _faq:faqdef_(8,_t179_,_faqob:t216_) _faq:faqdef_(9,_t180_,_faqob:t217_) _faq:faqdef_(10,_t181_,_faqob:t218_) } _t215_ { From the greenstone.org download page. } _t216_ { Yes. At present there are binary distributions for 32 bit Windows, PowerPC Mac OS X, and i386 linux. They can be downloaded from the download page. } _t217_ { While some version 2.37 and 2.38 CD-ROMs have been produced they're not currently being made widely available. You are encouraged to download the latest release of Greenstone from the download page. If your internet connection is such that downloading Greenstone isn't possible please contact us and we may be able to arrange for a CD-ROM to be sent out. } _t218_ { Yes, see our CVS page for details. } ####################################################################### package faqinst _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headinginstalling_) _faq:faqmainlink_ _faqdefs_ } _faqdefs_ { _faq:faqdef_(11,_t183_,_faqinst:t220_) _faq:faqdef_(12,_t184_,_faqinst:t221_) _faq:faqdef_(gliapplet, _tfaqgliapplettitle_, _faqinst:tfaqgliapplet_) } _t220_ { See our compiling page. } _t221_ { Firstly, the local library is only available if you're running Greenstone under Windows. It's not yet available on Unix.

    The major difference between the two is that the local library contains its own built-in webserver. The web library, however, requires an external webserver like Apache or Microsoft IIS. This makes the local library much easier to install and configure than the web library.

    For this reason, it's recommended that Windows users install the local library unless they're sure that they need the web library. Even if you think you might need the web library, try installing the local library first. You can always uninstall it later and install the web library if you then decide you need it.

    A situation where the web library may be preferable is if you plan to serve your Greenstone collections as a full-time service on the web. In this case you'll probably want the added stability that running the web library in conjunction with an external webserver can provide.

    Please note that the local library is quite capable of serving Greenstone collections over a local area network or the web (despite its rather misleading name).

    } _tfaqgliapplet_ {

    To get the GLI applet running, please do the following:

    1. Install Greenstone and the GLI on your server computer. Currently, this must be a GNU/Linux or Unix machine.
    2. Set up your web server (e.g. Apache) for Greenstone. Check that standard Greenstone works.
    3. Set the JAVA_HOME environment variable in your web server. E.g. for Apache, add the directive SetEnv JAVA_HOME path-to-java to the httpd.conf file. (This requires module mod_env.)
    4. Make the Greenstone "collect" directory world-writeable.
    5. The following four steps require the Java SDK. If you don't already have this it is available for download from http://java.sun.com/j2se/1.4.2/download.html.
    6. In the gsdl/gli directory, run
      keytool -genkey -alias privateKey -keystore appletstore -storepass greenstone
      Enter the appropriate details for your organization. When it asks to enter the key password for <privateKey>, choose your own password or hit Enter to use "greenstone".
    7. Run makegli.sh.
    8. Run makejar.sh.
    9. Run
      jarsigner -keystore appletstore -signedjar SignedGatherer.jar GLI.jar privateKey
      When it prompts, enter the password you used above.
    10. Move the SignedGatherer.jar file created into the gsdl/bin/java directory.
    11. Edit the gsdl/etc/main.cfg file and set the "gliapplet" field to "enabled".
    12. Visit your Greenstone homepage and click "The Librarian Interface" button. The applet should load and appear on this page, producing a button that says "Launch Greenstone Librarian Interface...". Clicking this will run the GLI as an applet, allowing users to build collections on your server without having Greenstone installed on their machines.

    Note that the applet transfers a lot of data between the machine it is running on and the server. This makes using the GLI applet impractical if you don't have a high speed connection between your machine and the server.

    } ####################################################################### package faqrun _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headingrunning_) _faq:faqmainlink_ _faqdefs_ } _faqdefs_ { _faq:faqdef_(13,_t186_,_faqrun:t223_) _faq:faqdef_(14,_t187_,_faqrun:t224_) _faq:faqdef_(15,_t188_,_faqrun:t225_) _faq:faqdef_(16,_t189_,_faqrun:t226_) _faq:faqdef_(17,_t190_,_faqrun:t227_) _faq:faqdef_(18,_t191_,_faqrun:t228_) _faq:faqdef_(19,_t192_,_faqrun:t229_) _faq:faqdef_(20,_t193_,_faqrun:t230_) _faq:faqdef_(21,_t194_,_faqrun:t231_) } _t223_ { If you're using the Windows local library you should be able to simply select "Greenstone Digital Library" from within the programs in your start menu.

    If you're using the web library things are a little less obvious however. First make sure your webserver is configured correctly and is running (see the Greenstone Installer's Guide and your webserver's documentation for details). You can then simply open your web browser and point it at the URL of Greenstone's library executable. This is dependent on the way you configured Greenstone and your webserver. Typically it might be something like http://localhost/gsdl/cgi-bin/library.exe.

    } _t224_ { Greenstone relies on a web browser that supports tables, javascript, and in some places, frames. Any reasonably modern browser will do. Examples are Microsoft Internet Explorer 4, Netscape 4, and Mozilla. Newer releases of all these browsers will also work.

    If you find that your favourite web browser does not work with Greenstone, please contact us.

    Note that there is one exception to the rule that any modern browser will do when running Greenstone: when you're using the restricted version of the Windows local library, you must use Netscape. See the discussion below on the differences between the "Restricted Version" and the standard "Enter Library" version of the local library for details.

    } _t225_ { The webserver built into the local library uses the networking software built into your Windows operating system in order to function. If your computer has never been connected to a network this networking software may not be installed however. For this reason Greenstone comes with some networking software of its own that it will use if it can't find any installed on your computer.

    When you click the "Enter Library" button, Greenstone first checks to see if your computer has its own networking software. If it does, it starts up using that; if not, it starts up using its own networking software.

    When you click the "Restricted Version" button, Greenstone doesn't bother checking your system for networking software; it just goes ahead and uses its own.

    The catch is that there are several limitations with using the Greenstone supplied networking software. The most important limitations are that the local library won't be accessible from the network if run in this way (that is, it really will be "local" to the machine on which it's running) and that it must use a Netscape web browser. Using your computer's built-in networking software is therefore the preferred option.

    } _t226_ { Since Greenstone will automatically use its own networking software if it can't find any installed on your computer it should not normally be necessary to run the "Restricted Version" explicitly.

    Times when it may be necessary are.

    } _t227_ { No, you don't need to be online. This is caused by the webserver built into Greenstone's local library sending a message to your computer's networking software to make sure it's functioning correctly. On many Windows systems this causes the familiar dial up dialog box to appear. In most situations you can simply cancel the dialog box and (if required) press your browser's reload button to continue.

    If this does not solve the problem, try starting the local library by clicking the "Restricted Version" button rather than the "Enter Library" button. See the discussion above on the differences between the standard and restricted versions of the local library for further details.

    } _t228_ {
    1. Check your web browser's internet proxy settings and turn proxies off (use Edit preferences on Netscape or Internet options on Explorer).
    2. If Internet Explorer gives a message saying "The page cannot be displayed" and "Cannot find server or DNS error" at the bottom of the page, check in your network settings that your computer's name is set up correctly. For example, if there is a DNS suffix entered in your TCP/IP properties (in the Control Panel), make sure that your host name and suffix are correct for your computer. If the server is running correctly, you should be able to connect by visiting http://127.0.0.1/ in a web browser on the same machine that the local library is running on.
    } _t229_ { Collections like those at www.nzdl.org will soon be made available for download. } _t230_ { The initial username required here is admin.

    If you installed Greenstone using the InstallShield installer on Windows or the Install.sh script on Unix you should have been asked to set a password during the installation procedure.

    If you didn't, don't worry, the password defaults to being admin.

    So if you don't know what to enter you should try username = admin, password = admin.

    } _t231_ { This may be caused by the URL becoming too long for your web browser. Because Greenstone currently stores all state information in the URL, if you do a search for a long phrase the URL can become very long. Different browsers on different platforms have different maximum URL lengths but in general it seems that Netscape can handle longer URLs than can Microsoft Internet Explorer.

    There is very little you can do to avoid this problem with the way Greenstone is currently implemented (aside from not searching for long phrases). Future versions of Greenstone may store some state information on the server rather than in the URL but this has yet to be implemented.

    } ####################################################################### package faqbuild _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headingbuilding_) _faq:faqmainlink_ _faqdefs_ } _faqdefs_ { _faq:faqdef_(gli, _tfaqbuildglititle_,_faqbuild:tfaqbuildglibody_) _faq:faqdef_(22,_t196_,_faqbuild:t233_) _faq:faqdef_(23,_t197_,_faqbuild:t234_) _faq:faqdef_(24,_t198_,_faqbuild:t235_) _faq:faqdef_(25,_t199_,_faqbuild:t236_) _faq:faqdef_(26,_t200_,_faqbuild:t237_) _faq:faqdef_(27,_t201_,_faqbuild:t238_) _faq:faqdef_(28,_t202_,_faqbuild:t239_) _faq:faqdef_(29,_t203_,_faqbuild:t240_) _faq:faqdef_(30,_t204_,_faqbuild:t241_) _faq:faqdef_(31,_t205_,_faqbuild:t242_) _faq:faqdef_(32, _tfaqbuild11title_,_faqbuild:tfaqbuild11body_) _faq:faqdef_(33, _tfaqbuildexpattitle_, _faqbuild:tfaqbuildexpatbody_) _faq:faqdef_(sizelimit, _tfaqbuildsizelimittitle_, _faqbuild:tfaqbuildsizelimitbody_) } _tfaqbuildglibody_ { The Greenstone Librarian Interface (GLI) is a graphical tool for building new collections, altering or deleting existing collections, and exporting existing collections to stand-alone CD-ROMs. It allows you to import or assign metadata, and has an interactive collection design module. Launch the GLI under Windows by selecting Greenstone Digital Library from the Programs section of the Start menu and choosing Librarian Interface. Under Linux, run gli.sh from the gsdl/gli directory. For details on using the Librarian Interface see the Greenstone User's Guide. } _t233_ { The Collector is a web interface for collection building, altering and exporting. It predates the Librarian Interface and for most practical purposes, the Librarian Interface should be used instead. To begin using the Collector, click the "The Collector" button on your Greenstone home page. For further details on using the Collector see the Greenstone User's Guide. } _t234_ { It's occasionally preferable to build your Greenstone collections from the command line rather than from the Collector. 
This allows you greater control over how your new collection turns out. See the Greenstone Developer's Guide for detailed step by step instructions on building collections from the command line. } _t235_ { Are you running Norton Anti-Virus? There are some incompatibilities between Norton and the Greenstone collection building process that cause unpredictable things to happen if you build your collection while Norton is running. Try disabling Norton and rebuilding the collection.

    If you do not have Norton or disabling Norton does not solve the problem please contact us for further help.

    } _t236_ { If you downloaded Greenstone from the web you will not have all the components required to make the "export to CD-ROM" function work. These extra components have been made available in a separate download which you can get from the download page. } _t237_ { Are you using a Netscape web browser with the local library? If so, try using Internet Explorer instead. There are some socket connection problems that show up on Windows 2000 when using Netscape. } _t238_ { The Organizer (also called the "Collection Organizer") is a Windows utility used for automatically generating some of the configuration files (metadata.xml, sub.txt etc.) used by complex Greenstone collections. } _t239_ { From the download page. } _t240_ { There are several reasons that the collector might fail to build a collection and the error messages it produces are not always very helpful.

    If you changed the default configuration during the configure collection stage you'll need to make sure the changes were valid. For example, if you added a new classify or plugin line you'll need to make sure that the classifier and/or plugin names and arguments are all correct. If they're not the collector will fail. A good test is to build your collection without changing the configuration. If it builds ok with the default configuration but fails after you change the configuration you'll need to look closely at the changes you're making.

    Another good thing to do if having problems with the collector is to build your collection from the command line instead. You'll get much more feedback to help debug problems when building in this way. For details on how to build a collection from the command line see the Greenstone developer's guide.

    } _t241_ { The collect.cfg files for many of the collections at www.nzdl.org have been made available here. } _t242_ { The MGPP user manual gives some instructions. } _tfaqbuild11body_ { Visit this page and follow the instructions. } _tfaqbuildexpatbody_ { Our prebuilt Linux and Mac OS X Greenstone distributions are built on machines using Perl 5.6, and these distributions contain a few binary perl modules. These cause problems if you are using a recent version of perl like 5.8 or 5.8.1 (you can type "perl -v" from the command line to see the version).

    On the Mac, our distribution contains modules for both perl 5.6 and 5.8 and the correct one should (hopefully) be installed.

    A typical error message during import.pl would be:

    Uncaught exception from user code: Can't load '/home/httpd/gsdl/perllib/cpan/auto/XML/Parser/Expat/Expat.so' for module XML::Parser::Expat: /home/httpd/gsdl/perllib/cpan/auto/XML/Parser/Expat/Expat.so: undefined symbol: PL_sv_undef at /usr/lib/perl5/5.8.0/i386-linux-thread-multi/DynaLoader.pm line 229. at /home/httpd/gsdl/perllib/cpan/XML/Parser.pm line 14

    To remedy this, you need to remove the "gsdl/perllib/cpan/XML" and "gsdl/perllib/cpan/auto" directories. (For version 2.52, remove gsdl/perllib/cpan/perl-5.8/XML and gsdl/perllib/cpan/perl-5.8/auto.) Then you need to install the perl XML::Parser natively for your system.

    On Red Hat or Mandrake, install the .rpm named "perl-XML-Parser"; on Debian, install the "libxml-parser-perl" package. For other Linuxes, use your distribution's package, or you can get it from http://search.cpan.org/~msergeant/XML-Parser-2.34/.

    You may also need to get Expat, available from http://sourceforge.net/projects/expat/. } _tfaqbuildsizelimitbody_ { The largest collections we have built have been 7 Gb of text, and 11 million short documents (about 3 Gb text). These built with no problems. We haven't tried larger amounts of text because we don't have larger amounts of text lying around. It's no good using 7 Gb twice over to make 14 Gb because the vocabulary hasn't grown accordingly, as it would with a real collection.

    There are three main limitations:

    1. There is a file size limit of 2 Gb on Linux (soon to be increased to infinity, the Linux people say). I don't know about corresponding figures for Windows; we use Linux for development. There are systems that go higher, but we don't have access to them.
      The compressed text will hit the limit first. MG stores the compressed text in a single file. 7 Gb will compress to just under 2 Gb, so you can't go much higher without splitting the compressed-text file (hacky, but probably easy).
    2. Technical. There is a Huffman coding limitation which we would expect to run into at collections of around 16 Gb. However, the solution is very easy; we just haven't bothered to implement it because we haven't yet encountered the problem.
    3. Build time. For building a single index on an already-imported collection, extrapolations indicate that on a modern machine with 1 Gb of main memory, you should be able to build a 60 Gb collection in about 3 days. However, there are often large gaps between theory and practice in this area! The more indexes you have, the longer things take to build.
    In practice, the solution for very large amounts of data is not to treat the collection as one huge monolith, but to partition it into subcollections and arrange for the search engine to search them all together behind the scenes. However, while you can amalgamate the results of searching subcollections fairly easily, it's much harder with browsing. Of course, A-Z lists and datelists and the like aren't really much use with very large collections. This is where new techniques of hierarchical phrase browsing come into their own. And the really good news is that you can partition a collection into subcollections, each with individual phrase browsers, and arrange to view them all together in a single hierarchical browsing structure, as one coordinated whole. We haven't actually demonstrated this yet, but it seems quite feasible.

    A test collection was built by "Archivo Digital", an office that depends on the "Archivo Nacional de la Memoria" (National Memory Archive in English), in Argentina. It contained sequences of page images with associated OCR text.

    Setup details

    Statistics

    } ####################################################################### package faqplugins _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headingplugins_) _faq:faqmainlink_ _faqdefs_ } _faqdefs_ { _faq:faqdef_(plugins0,_tfaqplugins0title_,_faqplugins:available_) _faq:faqdef_(plugins1,_tfaqplugins1title_,_faqplugins:metadata_) _faq:faqdef_(plugins2,_tfaqplugins2title_,_faqplugins:pdfproblems_) _faq:faqdef_(plugins3,_tfaqplugins3title_,_faqplugins:unknownplug_) } _available_ { See this page. } # base puts in surrounding

    and

    , so skip first and last ones # _metadata_ { "Default" means that the metadata fields will be automatically assigned (or extracted if possible), while the "Available fields" lists other items of metadata that the plugin may be able to assign based on any arguments given to that plugin in the collect.cfg file. All plugins are derived from BasPlug, and have the following metadata fields:
    Default fields Available fields
    BasPlug Language, Encoding, Source FirstNNNN, Keyphrases, Acronym

    In addition, many plugins have additional fields available:
    Plugin name Default fields Available fields
    BibTexPlug Title, Creator, Abstract, Author, Booktitle, Chapter, Copyright, Date, Edition, Editor, EntryType Journal, Keywords, Month, Note, Number, Pages, Publisher, PublisherAddress, Volume, Year  
    DBPlug   (arbitrary metadata field names based on Database configuration file)
    EMAILPlug Date, DateText, From, FromAddr, FromName, Headers, Subject, Title (based on subject, from, and date), To  
    ExcelPlug   (all fields as in HTMLPlug)
    HTMLPlug Title, URL Author, Creator, Email (others as found in the -metadata_fields option)
    ImagePlug Image, ImageHeight, ImageSize, ImageType, ImageWidth, ScreenHeight, screenicon, ScreenSize, ScreenType, ScreenWidth, Source, srclink, srcicon, Thumb, ThumbHeight, ThumbType, ThumbWidth  
    IndexPlug as in the index.txt file (use metadata.xml files instead of using this plugin)
    MARCPlug Creator, Description, MarcIdentifier, MarcSource, URL, Publisher, Relation, Rights, Subject, Title, Type (Metadata fields as in the marctodc.txt file)
    OAIPlug URL, (all metadata in .oai markup file)  
    PagedImgPlug Image, ImageHeight, ImageSize, ImageType, ImageWidth, ScreenHeight, screenicon, ScreenSize, ScreenType, ScreenWidth, Source, srclink, srcicon, Thumb, ThumbHeight, ThumbType, ThumbWidth  
    PDFPlug   (all fields in HTMLPlug)
    PPTPlug   (all fields in HTMLPlug)
    PSPlug Title Date, Pages, (all fields in TextPlug)
    ReferPlug Abstract, BookConfOnly, Booktitle, Copyright, Creator, Date, Editor, Keywords, Journal, JournalsOnly, Number, Pages, Publisher, Publisheraddr, Report, Title, Volume  
    RTFPlug   (all fields in HTMLPlug)
    SRCPlug Title, filename, includes, class, classdecl  
    TEXTPlug Title  
    UnknownPlug (as given in the -assoc_field plugin argument)  
    WordPlug   (all fields in HTMLPlug)

    See section two of the _docs:developersguide_ for information about options to plugins, or run the pluginfo.pl command on the plugin name after setting up your environment for Greenstone. (For example, "perl -S pluginfo.pl BasPlug".)

    In addition, every document can be manually assigned arbitrary metadata fields and values through use of metadata.xml files, as discussed in the manual. } # base puts in surrounding

    and

    , so skip first and last ones # _pdfproblems_ { PDF is a "page description language". This means that the document contains objects and commands such as "draw this text here" and "draw this image here".

    Greenstone uses an external program called "pdftohtml" to extract text out of PDF files. Sometimes, there is no text that can be extracted. This often depends on how the PDF was created.

    1. Adobe Acrobat Writer can be used to create PDFs from paper documents that are scanned in by a scanner. In this case, the PDF file contains images of text, rather than computer-readable text. Therefore, pdftohtml cannot find any text to extract.
    2. Some programs (such as older versions of GNU ghostscript, which is used by ps2pdf on Unix computers) sometimes create "bitmap fonts", which means that every character in the document is really an image rather than a computer readable letter. The LaTeX type-setting program sometimes does this when the "Computer Modern Roman" font is used.
    3. Certain characters and character combinations may be extracted incorrectly, depending on the program that generated the PDF file. For example, "ligatures" such as "fi", "fl", "ff" and "ffl" are often rendered using a special glyph rather than as individual characters, and this information may be lost in the textual representation. Also, some PDF generating programs may not correctly encode accented characters. For example, to draw a lowercase "u" with an umlaut accent, LaTeX draws a "u" and then draws an umlaut accent over it. This means that pdftohtml will extract two separate characters (¨ and 'u') rather than a single accented character (ü).
    4. PDF contains pieces of text, and coordinates for where that text should be displayed. This means that pdftohtml may incorrectly guess the order in which the text fragments are supposed to occur. For example, for text that is in two or more columns, the text may be extracted as the first sentence of each column, then the second sentence of each column, and so on. In this case, the extracted text is still usable for indexing purposes, but should not be displayed. Instead, a format statement should be added to the collect.cfg file to provide a link to the original PDF file but not to the extracted text, such as:
      format SearchVList "<td valign=top>[srclink][srcicon][/srclink]</td> <td>[srclink][Title][/srclink]</td>"
    5. Because of the way that images are embedded in PDF files, pdftohtml occasionally extracts an image upside-down, or mirrored. This appears to be a bug in the program.
    } _unknownplug_ {UnknownPlug is a simple plugin for importing files in formats that Greenstone doesn't know anything about. A dummy document will be created for every such file, and the file itself will be passed to Greenstone as the \"associated file\" of the document.

    Here's an example where it is useful: A collection has pictures and includes a couple of quicktime movie files with names like DCP_0163.MOV. Rather than write a new plugin for quicktime movies, add this line to the collection configuration file:

    plugin UnknownPlug -process_exp "*.MOV" -assoc_field "movie"

    A document is created for each movie, with the associated movie file's name in the "movie" metadata field. In the collection's format strings, use the \{If\} macro to output different text for each type of file, like this:

    \{If\}\{[movie],<HTML for displaying movie>\}
    \{If\}\{[Image],<HTML for displaying image>\}

    You can also add extra metadata, such as the Title, Subject, and Duration, using the Librarian Interface (or with metadata.xml files and RecPlug).

    The -process_exp option is a regular expression that matches filenames which should be processed by UnknownPlug. You can have several UnknownPlugs specified for a collection, each processing a different kind of file.

    The -assoc_field option is the name of the metadata field that will hold the associated file's name. This can be used to test for these files. You can also specify the mime type of the files to be processed using the -mime_type option. To display the original file, use [srclink][/srclink] metadata. } ####################################################################### package faqcustomize _content_ { _pageheading_(_faq:greenstonefaq_ - _faq:headingcustomize_) _faq:faqmainlink_

    _faqdefs_ } _faqdefs_ { _faq:faqdef_(customizefrontpagelogo,_tfaqcustomizefrontpagelogotitle_,_faqcustomize:tfaqcustomizefrontpagelogo_) _faq:faqdef_(customizemoreinfo,_tfaqcustomizemoreinfotitle_,_faqcustomize:tfaqcustomizemoreinfo_) _faq:faqdef_(customizeformat,_tfaqcustomizeformattitle_,_faqcustomize:tfaqcustomizeformat_) _faq:faqdef_(customizemetadatalinking,_tfaqcustomizemetadatalinkingtitle_,_faqcustomize:tfaqcustomizemetadatalinking_) _faq:faqdef_(customizenewpage,_tfaqcustomizenewpagetitle_,_faqcustomize:tfaqcustomizenewpage_) _faq:faqdef_(customizenotext,_tfaqcustomizenotexttitle_,_faqcustomize:tfaqcustomizenotext_) } _tfaqcustomizefrontpagelogo_ { You can change the logo that appears at the top of the front page of your library by editing the home.dm file in your Greenstone "macros" folder and replacing all instances of "gsdlhead.gif" with the name of your logo image file. } _tfaqcustomizemoreinfo_ { Try this document. } _tfaqcustomizeformat_ { Section 2.3 of the Greenstone Developer's Guide discusses how to format the output of your collection. However, the list of options is incomplete. The full list of formatting options is shown here. But for more information about how to use these options, the developer's guide is the place to go.

    Site-wide formatting options
    These should be placed in gsdl/etc/main.cfg.
    Syntax: SiteFormat <option-name> <option-value>

    ItemDescription
    HomePageCols intSet the number of columns used to display collections on the home page.
    Default: 3
    HomePageType pulldownDisplay the collection list on the home page as a pulldown menu, rather than using the default table of collection images. This alters the html that appears in the dynamically generated _homeextra_ macro. You can then move this macro around in home.dm.
    Default: not set

    Collection-specific formatting options
    These should be placed in gsdl/collect/<collname>/etc/collect.cfg.
    Syntax: format <option-name> <option-value>

    ItemDescription
    DocumentImages true/falseIf true, display a cover image at the top left of the document page
    Default: false
    DocumentTitles true/falseIf DocumentImages is false, and this is true, use DocumentHeading to display the title.
    Default: true
    DocumentHeading formatstringThis is used for a document heading at the top left if DocumentImages is false and DocumentTitles is true.
    Default: \{Or\}\{[parent(Top):Title],[Title],untitled\} <br>[Title]
    DocumentContents true/falseDisplay table of contents (if document is hierarchical), or next/previous section arrows and "page k of n" text (if document is paged)
    Default: true
    DocumentButtons stringControls the buttons that are displayed on a document page. Valid options are Detach, Highlight, Expand Text, Expand Contents. Should be separated by |.
    Default: "Detach|Highlight"
    DocumentText formatstringFormat of the text to be displayed on a document page
    Default: <center> <table width=537> <tr><td>[Text]</td></tr> </table> </center>
    DocumentArrowsTop true/falseDisplay next/previous section arrows at top of document, underneath the navigation bar, on document page
    Default: false
    DocumentArrowsBottom true/falseDisplay next/previous section arrows at bottom of document page
    Default: true
    DocumentUseHTML true/falseIf true, each document is displayed in a separate frame. The Preferences page will also change slightly, adding options applicable to a collection of HTML documents.
    Default: false
    NavigationBar pulldownIf set, provides a drop down list in place of the usual navigation bar (that contains search and classifier options). This alters the html that appears in the dynamically generated _navigationbar_ macro.
    Default: not set
    AllowExtendedOptions true/falseThis allows the entire content of the document page to be controlled by format statements. Use DocumentHeading and DocumentText to format the document. This option prevents the other hard coded stuff (table of contents, buttons etc) from being output. It effectively disables the DocumentContents, DocumentButtons, DocumentImages format options. New format items are provided for use in format statements if AllowExtendedOptions is true (see table below)
    Default: false

    Formatting Lists
    The standard use of format statements is for the lists in search results, classifiers etc. Here is a list of the various lists available for format, and what they control. Note that classifiers are numbered from 1 upwards, in the order that they appear in the config file.

    ItemDescription
    VListApplies to all vertical lists, unless overridden by a more specific format item. These include search results, classifier lists, and document table of contents
    HListApplies to all horizontal lists. Horizontal lists are often used in classifiers, particularly AZ[Compact][Section]Lists
    DateListApplies to all date lists - these are the vertical lists generated by a DateList classifier.
    SearchVListThe vertical list of search results
    DocumentVListThe document table of contents
    CL1VListApplies only to the vertical list of classifier 1
    CL1HListApplies only to the horizontal list of classifier 1
    CL1DateListApplies only to the DateList in classifier 1

    Formatstring items

    ItemDescription
    [link][/link]Link to the document (Greenstone version)
    [srclink][/srclink]Link to the original document (only if the original was converted to another form)
    [icon]An appropriate icon for a classifier/document node. E.g. bookshelf, book, chapter, page
    [srcicon]An appropriate icon for the original source document. E.g. Word, PDF, PS icon.
    [num]The document number (position in the search results - useful for debugging)
    [numleafdocs]The number of documents below the current classifier node. This is often used as a test for classifier nodes, as numleafdocs will not be set for documents. This allows different formatting for classifier nodes and document nodes in a hierarchy.
    [Text]The text of the current section
    [RelatedDocuments]Related Documents info (if available). This is a vertical list of Titles (or Subjects if Titles aren't available) that link to the related documents. It is based on "relation" metadata, which is a space separated list of collection,OID pairs.
    [highlight][/highlight]These are used for 'highlighting' (actually bolding) the selected section in a hierarchical table of contents, and the selected node in a classifier. Apart from those two cases, this has no effect. If you actually want to highlight/bold/italicise something in a list, and have it apply to all items in the list, then either use actual html tags, like <b></b>, <u></u> and <i></i>, or use the _starthighlight_ and _endhighlight_ macros (defined in macros/base.dm).
    [Summary]Displays Summary metadata if available, otherwise displays a short summary created on the fly.
    [DocOID]The internal identifier of the document
    [DocRank]The rank of the current document - used in search results
    [DocImage]The URL to the cover image of the document
    [collection]The directory name of the collection this document is from - useful in cross-collection searching. (version 2.61)
    [collection:meta-name]A collection metadata for the collection this document is from - useful in cross-collection searching. E.g. [collection:collectionname]. This will display in the current language if an appropriate version is available. (version 2.61)
    [metadata-name]The value of this metadata element for the document

    Extended metadata names
    There are a few options for displaying metadata. The basic way is to specify e.g. [Title] or [dc.Title]: this displays the value of that particular metadata element for the current document/section. Metadata names can be prefixed by parent:, child: or sibling:. The following examples all use Title or Subject metadata, but any metadata could be used, including ones with namespaces (e.g. dc.Title). Any metadata name can also be prefixed by "cgisafe:". This results in the value being formatted so that it is safe to put in a URL.

    [parent:Title]The Title of the immediate parent section
    [parent(Top):Title]The Title of the topmost parent section
    [parent(All):Title]All Titles of the parent sections, separated by ", "
    [parent(All': '):Title]All Titles of the parent sections, separated by ": " (or whatever appears inside the ' ')
    [child:Subject]The Subjects of all child nodes of the current node, separated by ' '. (child modifier available from version 2.61)
    [child(All'xxx'):Subject]The Subjects of all child nodes of the current node, separated by xxx
    [child(2):Subject]The Subject of the second child of the current node. Child numbering starts from 1.
    [child(last):Subject]The Subject of the last child of the current node. 'first' is also a valid option.
    [sibling:Subject]All Subjects of the current section, separated by ", ". This is used for displaying metadata where there is more than one value. [Subject] will just display the first value.
    [sibling(All'<br>'):Subject]All Subjects of the current section, separated by <br>.
    [sibling(2):Subject]The second Subject metadata value for the current node. Numbering starts from 1.
    [sibling(last):Subject]The last Subject metadata value for the current node. 'first' is also a valid option.
    [parent:sibling:Subject]sibling can be combined with parent to give all (or specific) values for the parent node(s). All parent and sibling qualifiers can be used. (version 2.61)
    [child:sibling:Subject]sibling can be combined with child to give all (or specific) values for the child node(s). All child and sibling qualifiers can be used. (version 2.61)
    [cgisafe:parent(Top):Title]The Title of the topmost parent section, made safe for URLs.
    [cgisafe:sibling(All'<br>'):Subject]All Subjects of the current section, separated by <br>, made safe for URLs.

    Extended Formatstring items
    These items are only available if AllowExtendedOptions is true.

    ItemDescription
    [DocumentButtonDetach]The Detach button
    [DocumentButtonHighlight]The Highlight button
    [DocumentButtonExpandText]The Expand Text button
    [DocumentButtonExpandContents]The Expand Contents button
    [DocTOC]The table of contents for a hierarchical document, or the next/previous and go to page x bits for a paged document.

    Conditional expressions in formatstrings
    \{If\}\{[metadata], action-if-non-null, action-if-null\}If there is a value for this metadata element, then output the first clause, otherwise output the second clause. Either clause is optional: if empty, nothing will be done for that case.
    This is useful for displaying classifier nodes differently to document nodes: \{If\}\{[numleafdocs],format for classifier,format for document\}
    \{If\}\{[metadata] op value, action-if-true, action-if-not-true\}Can do tests on metadata values. These can be string comparisons, or numeric comparisons. Valid operators are:
    StringNumericMeaning
    eq==equals
    ne!=not equals
    gt>greater than
    ge>=greater than or equal to
    lt<less than
    le<=less than or equal to
    swstarts with
    ewends with

    Note that only eq and ne are available for Greenstone versions 2.60 and earlier.
    \{Or\}\{[metadata], [metadata2], [metadata3]...\}Each metadata is evaluated in turn, and the first one that exists is output
    Useful for cases where there are different namespaced versions of the same metadata, e.g. \{Or\}\{[dc.Title],[dls.Title],[Title],Untitled\}. The last item can be plain text.
    nested If/Or\{Or\} can have another conditional as its final option, eg \{Or\}\{[BookTitle],[Title],\{If\}\{[XXX],aaa,bbb\} \}. The following is not valid: \{Or\}\{[BookTitle],[Title]\{If\}\{[XXX],aaa,bbb\} \}.
    \{If\} can have nested conditionals at either true/false option. eg. \{If\}\{[numleafdocs],[Title],[BookTitle]\{If\}\{[Subject],[Subject],unclassified\} \}
    } _tfaqcustomizemetadatalinking_ { [contributed by Axel Schild]
    When a metadata element has only one value, it is easy to make a hyperlink out of the value. In the format statement, you just put an <a> tag around the metadata item, for example:
    <a href="url to link to">[dc.Subject]</a>
    When the metadata item has multiple values, and you want to link each one separately, it is a bit more difficult. The following is Axel's solution to his particular problem: display all the Creator elements, each one hyperlinked to a search of that Creator in the Creators index.

    Use the format string below in the collect.cfg file (in this case, as part of the "format DocumentText" statement)

    \{If\}\{[dc.Creator],
    <tr>
    <td align=right valign=top><b>Authors:</b></td>
    <td align=left valign=bottom><label name=AuthorField id=AuthorField>
    \_httpquery\_;[cgisafe:sibling(All:\\' ; \\'):dc.Creator];[sibling(All:\\'\_\\'):dc.Creator]
    </label></td>
    </tr>\} 
    
    This statement includes a label definition with the name "AuthorField". "\_httpquery\_" is a macro which resolves into the http-address of the query page of the collection. "[cgisafe:sibling(All:\\' ; \\'):dc.Creator]" displays all Creators, separated by ; and with any special characters escaped for use within a web address. [sibling(All:\\'\_\\'):dc.Creator] produces a similar string without escaping the special characters. Notice the different separation symbols, these are needed later on.

    Additional changes have to be made in order to make this whole thing work. You further need to change the \_header\_ or \_textheader\_ macro in the package of the page the format string will be displayed in (in this case the document package). The change is that \_htmlhead\_ has to be parametrized with \_htmlhead\_(onload="ExtractAuthors();"), where ExtractAuthors(); is a Javascript function that is called on loading the corresponding page (the document display page). Since you do not want to mess in the standard macro files, create an extra.dm file (in gsdl/collect/<collname>/macros) and override the chosen macro with a collection specific macro. In this example this is done by the code sequence

    package document
    
    ###document display
    
    ###HTML-Page Header
    \_textheader\_ [c=exacol] \{\_cgihead\_
    \_htmlhead\_(onload="ExtractAuthors();")
    <center>
    <table width=\_pagewidth\_><tr><td align=right>
    \_icontab\_\_javalinks\_</td></tr></table>
    </center>
    \}
    
    Now all that is missing is the Javascript function which has to be included into the \_pagescriptextra\_ macro of the same package. Copy this macro out of the corresponding standard macro file and paste it into your extra.dm file. Make the necessary modification which is in this case
    ### Self-made Javascript functions
    \_pagescriptextra\_\{
    function ExtractAuthors() \\\{
      var res;
      a = AuthorField.outerText.split(";");
      resolver = a[0]+"&q=";
      b = a[1].split("+%3b+");
      c = a[2].split("\_");
      res = "";
      for (i = 0; i < b.length ;i++)
        \\{
           res = res + "<a href=" + resolver + b[i]+ "&h=dd0&t=0>" + c[i] + "</a><br/>";
        \\}
      AuthorField.outerHTML = res;
    \\}
    \}
    
    This Javascript function evaluates the string of the defined label, splits it into several strings and composes a string out of those values, which is then set to the "outerHTML" element of the label. "&h=dd0" indicates which index to search in; dd0 should be replaced with the name of the appropriate index. The file gsdl/collect/<collname>/index/build.cfg gives the names of the various indexes. } _tfaqcustomizenewpage_ { Adding a new static page to Greenstone is relatively simple and involves only customizations to macro files (found in gsdl/macros). (Adding a dynamic page is more difficult and involves adding a new action to the C++ receptionist.)

    Let's look at the example of adding a new static page called mypage.

    Link to the new page from the home page
    Edit the \_content\_ macro in home.dm, adding the link where you want it. The link will look something like <a href="\_httppagex\_(mypage)">My Page</a>.
    Create the macro file
    Create a new file called mypage.dm in gsdl/macros.
    Tell the library about the new file
    Edit gsdl/etc/main.cfg. Add mypage.dm to the macrofiles list.
    mypage.dm will contain all the content on the page. A basic mypage.dm might look something like the following.
    package mypage
    
    \_pagetitle\_ \{My New Page\}
    
    \_content\_ \{
    <h2>My New Page</h2>
    This is the content of the page.
    \}
    
    The URL for the new page can be specified internally as \_httppagex\_(mypage): this maps to your library address with cgi arguments a=p&p=mypage.

    If this is not a page that belongs to a collection, you should modify the 'home help prefs' buttons. You may like to add the following to your mypage.dm.

    \_javalinks\_ \{\_imagehome\_ \_imagehelp\_ \_imagepref\_ \}
    
    \_imagehelp\_ \{\_gsimage\_(\_home:httppagehomehelp\_,\_httpiconchelpof\_,\_httpiconchelpon\_,help,\_textimagehelp\_)\}
    
    \_imagepref\_ \{\_gsimage\_(\_home:httppagehomepref\_,\_httpiconcprefof\_,\_httpiconcprefon\_,pref,\_textimagepref\_)\}
    
    This will give you the standard home button, and the help and preferences buttons that link to the same pages as those on the home page. Help and preferences are different depending on whether you are in a collection or not.

    If you are using the Windows local library server you probably need to restart it for the changes to take effect. } _tfaqcustomizenotext_ { Instead of [Text] in the DocumentText format statement, use
    \{If\}\{[Text] ne 'This document has no text. ',[Text]\}
    If you have installed Greenstone in a different language, then you need to put the correct language string into the If statement. (Since version 2.62.) }