###########################################################################
#
# TEXTPlug.pm -- simple text plugin
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# creates simple single-level document. Adds Title metadata
# of first line of text (up to 100 characters long).
# 12/05/02 Added usage datastructure - John Thompson
package TEXTPlug;
use BasPlug;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
# Establish inheritance at compile time: TEXTPlug is-a BasPlug.
BEGIN {
    @TEXTPlug::ISA = qw(BasPlug);
}
# Descriptions of the arguments this plugin accepts. new() appends these
# to the "ArgList" entry of the hash it hands on to the BasPlug constructor.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'title_sub',
        'desc' => '{TEXTPlug.title_sub}',
        'type' => 'regexp',
        'deft' => '',
        'reqd' => 'no',
    },
];

# Description of the plugin itself. new() appends it to the "OptList"
# entry of the same hash.
my $options = {
    'name'     => 'TEXTPlug',
    'desc'     => '{TEXTPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    # Source docs in regular txt format can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'     => $arguments,
};
# Constructor.
#
# Parameters:
#   $class           - name of the class being instantiated
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - handed on unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      description to its "OptList" entry
#
# Returns the object produced by the BasPlug constructor, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use an explicit class-method call instead of the indirect-object form
    # "new BasPlug(...)": how Perl parses the latter depends on which subs
    # and packages are known at that point, and this package has its own new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
# Returns, as a string, the default value of the "process_exp" argument:
# a case-insensitive pattern matching names that end in ".txt" or ".text".
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.te?xt$';
}
# Plugin-specific processing step.
#
# Parameters:
#   $self     - the plugin object
#   $textref  - reference to a scalar holding the document text
#   $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
#             - accepted so the call signature stays the same, but not
#               used by the code below
#
# Writes the text, preceded by one newline and followed by two, to STDERR
# and returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # The statement used to end in an unbalanced ")", which made the whole
    # file fail to compile; the text that is emitted is unchanged.
    print STDERR "\n$$textref\n\n";

    return 1;
}
# replace_srcdoc_with_html.pl requires all subroutines that support src_replaceable
# to contain a method called tmp_area_convert_file - this is indeed the case with all
# Perl modules that are subclasses of ConvertToPlug.pm, but as we want TEXTPlug to also
# be srcreplaceable and because TEXTPlug does not inherit from ConvertToPlug.pm, we have
# this ugly solution: same subroutine name.
# Despite the subroutine name, this method does not in fact create the output html file in
# some tmp folder. Instead, it creates the html file in the same folder as the input_filename
# and writes the contents as html paragraphs nested inside the html body.
# It also sets the encoding of the html document created to UTF-8 in the head's meta tag.
# Note: doesn't seem to be able to cope with tags written XHTML-style with a slash
# (self-closing tags such as <br />) -> slashes are a problem.
# As a consequence, we resorted to making it not proper xhtml but just regular html.
# The output file's name will be utf8 AND might not be the same as the input file's name
# (for instance, the output filename may have a number appended to it if there is already an html
# file in the input folder with the same name).
sub tmp_area_convert_file {
my $self = shift (@_);
my ($output_ext, $input_filename) = @_;
#my $outhandle = $self->{'outhandle'};
#my $failhandle = $self->{'failhandle'};
#my $convert_to_ext = $self->{'convert_to_ext'};
# derive output filename from input filename
my ($tailname, $dirname, $suffix)
= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
# convert to utf-8 otherwise we have problems with the doc.xml file
# later on
&unicode::ensure_utf8(\$tailname); # TO DO: does this change the filename or not?
#my $output_filename = $tailname$output_ext;#output_ext has to be html!
my $output_filename = &util::filename_cat($dirname, $tailname.".html");
# read contents of text file line by line into an array
# create an HTML file from the text file
# Recreate the original file for writing the updated contents
unless(open(TEXT, "<$input_filename")) { # open it as a new file for writing
print STDERR "TEXTPlug.pm: Unable to open and read from $input_filename for converting to html...ERROR\n";
return 0;
}
my @lines = ();
my $line;
my $newpara = 1; # true whenever we're going to start a new para
while ($line=
".$line); # start a new paragraph
$newpara = 0;
} else { # text-line is not a new paragraph, but just a new line
push(@lines, "
\n".$line); # put a break. It doesn't seem to accept
}
}
}
close TEXT;
# we've come to the last line of input file, make sure that the text ends on