###########################################################################
#
# niupepabuildproc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# Collection-specific build processor: a subclass of mgbuildproc that
# customises how text is filtered for indexing and how documents are typed.
package niupepabuildproc;

use strict;

use mgbuildproc;
use unicode;

# Declared with "our" so the assignment below is legal under "use strict".
our @ISA;

BEGIN {
    @ISA = ('mgbuildproc');
}

# Constructor.
#
# Arguments (after the class name): $collection, $source_dir, $build_dir,
# $verbosity, $outhandle.  All five are handed on, unchanged and in the same
# order, to the mgbuildproc constructor; previously they were unpacked here
# and then silently dropped, so the parent was always built with no
# arguments.
#
# Returns the object produced by mgbuildproc, reblessed into $class.
sub new {
    my ($class, $collection, $source_dir, $build_dir,
        $verbosity, $outhandle) = @_;

    my $self = mgbuildproc->new($collection, $source_dir, $build_dir,
                                $verbosity, $outhandle);

    return bless $self, $class;
}

# We want to strip out the macrons if we're indexing the text - we then
# strip any macrons from search terms at runtime so that we'll hit all
# versions of a word, with or without macrons.
# Note that this behaviour may not be best in all cases with Maori text as
# there are examples of completely different words being spelt the same
# except for their respective macrons. For this collection it seems like
# the best solution though.
# Replace macronised vowels with their plain ASCII letters when indexing.
#
# Invoked as a method; after $self is shifted off, $_[1] is the text to
# filter ($_[0] is not used here).  The text is rewritten in place through
# the @_ alias, so the caller's own variable is what changes.  Nothing is
# touched unless $self->{'indexing_text'} is true.
sub filter_text {
    my $self = shift(@_);

    if ($self->{'indexing_text'}) {
        # Code point of each macronised vowel => code point of the plain
        # letter that replaces it.
        my %plain_for = (
            256 => ord("A"), 257 => ord("a"),
            274 => ord("E"), 275 => ord("e"),
            298 => ord("I"), 299 => ord("i"),
            332 => ord("O"), 333 => ord("o"),
            362 => ord("U"), 363 => ord("u"),
        );

        my $codepoints = unicode::utf82unicode($_[1]);

        # The loop variable aliases each array element, so assigning to it
        # updates the array itself.
        foreach my $cp (@$codepoints) {
            $cp = $plain_for{$cp} if exists $plain_for{$cp};
        }

        $_[1] = unicode::unicode2utf8($codepoints);
    }
}

# All documents are "paged" for this collection, so the same fixed pair of
# type names is returned regardless of arguments.
# NOTE(review): presumably the pair is (type of the document itself, type of
# its children) - confirm against the caller in mgbuildproc.
sub get_document_type {
    return qw(Invisible Paged);
}

1;