########################################################################### # # TEXTPlug.pm -- simple text plugin # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # creates simple single-level document from .txt or .text files # (case-insensitive match on filenames). Adds Title metadata # of first 100 characters found. 
package TEXTPlug;

use BasPlug;
use sorttools;

# TEXTPlug inherits from BasPlug.
sub BEGIN {
    @ISA = ('BasPlug');
}

# Constructor: builds a BasPlug object and re-blesses it into $class.
sub new {
    my ($class) = @_;

    # Lexical on purpose: an undeclared $self would be a package global
    # shared by every caller of this constructor.
    my $self = BasPlug->new();

    return bless $self, $class;
}

# Returns 0: this plugin does not recurse.
sub is_recursive {
    my $self = shift (@_);

    return 0; # this is not a recursive plugin
}

# Reads one text file, wraps its contents in a single-section document and
# passes that document to $processor->process().
#
# Arguments (after $self):
#   $pluginfo  - not used by this plugin
#   $base_dir  - directory prefix; might be ""
#   $file      - file name, which might include directories
#   $metadata  - hash ref of metadata to attach to the document; a value may
#                be a plain scalar or an array reference of values
#   $processor - object whose 'verbosity' entry is consulted and whose
#                process() method receives the finished document
#
# Returns 1 when the file was processed, undef when the file name does not
# end in .txt/.text (optionally followed by .gz, case-insensitive) or the
# file does not exist.  Dies if the file (or the zcat pipe) cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = util::filename_cat($base_dir, $file);

    return undef unless ($filename =~ /\.(te?xt(\.gz)?)$/i && (-e $filename));

    # $2 is only set when the optional ".gz" suffix matched
    my $gz = defined ($2) ? 1 : 0;

    print STDERR "TEXTPlug: processing $filename\n" if $processor->{'verbosity'};

    # create a new document
    my $doc_obj = doc->new($file, "indexed_doc");

    # Lexical handle, explicit open modes: a file name containing spaces or
    # shell/mode characters must be treated as a name, never as a command.
    my $fh;
    if ($gz) {
        # list-form pipe open runs zcat directly, without going through a shell
        open ($fh, '-|', 'zcat', $filename)
            || die "TEXTPlug::read - zcat can't open $filename\n";
    } else {
        open ($fh, '<', $filename)
            || die "TEXTPlug::read - can't open $filename\n";
    }

    my $cursection = $doc_obj->get_top_section();

    my $text = "";
    my $line = "";

    # don't need to get title if it has been passed
    # in from another plugin
    my $foundtitle = defined ($metadata->{'Title'}) ? 1 : 0;

    while (defined ($line = <$fh>)) {
        # use first line as title (or first 100 characters if it's long);
        # substr returns the whole line when it is shorter than 100 characters
        if (!$foundtitle && length($line) > 5) {
            $doc_obj->add_metadata ($cursection, "Title", substr ($line, 0, 100));
            $foundtitle = 1;
        }
        $text .= $line;
    }

    # release the descriptor (and reap the zcat child when there is one)
    close ($fh);

    # NOTE(review): the <pre>...</pre> wrapper is reconstructed -- the markup
    # appears to have been stripped from the copy under review; confirm
    # against the upstream TEXTPlug.pm.
    $doc_obj->add_text ($cursection, "<pre>\n$text\n</pre>");

    foreach my $field (keys(%$metadata)) {
        my $value = $metadata->{$field};

        # $metadata->{$field} may be an array reference
        if (ref ($value) eq "ARRAY") {
            foreach my $item (@$value) {
                $doc_obj->add_metadata ($cursection, $field, $item);
            }
        } else {
            $doc_obj->add_metadata ($cursection, $field, $value);
        }
    }

    # add OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

1;