source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1240

Last change on this file since 1240 was 1229, checked in by sjboddie, 24 years ago

fixed bug in options

  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28use parsargv;
29use multiread;
30use cnseg;
31use strict;
32
# Print the usage message for the general options (those available to
# all plugins) on STDERR.  Called when the general options handed to a
# plugin cannot be parsed.
#
# Takes no arguments.  The encoding names listed here are the ones
# accepted by the input_encoding check in new(), so the text always
# spells them exactly as they must be typed (iso_8859_1, with
# underscores).
sub print_usage {
    print STDERR
        "\nOne of your plugins uses an incorrect general option (general options are those\n",
        "available to all plugins). Check your collect.cfg configuration file.\n",
        "\n usage: plugin plugin-name [options]\n\n",
        " currently supported general options are:\n",
        " -input_encoding The encoding of the source documents. Documents will be\n",
        " converted from these encodings and stored internally as\n",
        " utf8. The default input_encoding is Latin1. Accepted values\n",
        " are:\n",
        " iso_8859_1 (extended ascii)\n",
        # was "iso-8859-1", a spelling the option parser rejects
        " Latin1 (the same as iso_8859_1)\n",
        " ascii (7 bit ascii -- may be faster than Latin1 as no\n",
        " conversion is necessary)\n",
        " gb (GB or GBK simplified Chinese)\n",
        " iso_8859_6 (8 bit Arabic)\n",
        " windows_1256 (Windows codepage 1256 (Arabic))\n",
        " Arabic (the same as windows_1256)\n",
        " utf8 (either utf8 or unicode -- automatically detected)\n",
        " unicode (just unicode -- doesn't currently do endian\n",
        " detection)\n\n";
}
55
# Constructor shared by all plugins.
#
# Arguments: the class name followed by the plugin's option list.  The
# general option -input_encoding is parsed out of that list with
# parsargv::parse; its value must match one of the names in the anchored
# pattern below and defaults to Latin1.  The result is stored in
# $self->{'input_encoding'}.
# NOTE(review): "allow_extra_options" presumably tells parsargv to leave
# plugin-specific options alone -- confirm against parsargv.pm.
#
# If parsing fails the usage message is printed and the constructor dies.
# Returns the new object blessed into $class.
sub new {
    my $class = shift;

    my $accepted = '^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)$';
    my $self = {};

    # general options available to all plugins
    my $parsed_ok = parsargv::parse (\@_,
        "input_encoding/$accepted/Latin1", \$self->{'input_encoding'},
        "allow_extra_options");

    unless ($parsed_ok) {
        print_usage();
        die "\n";
    }

    return bless ($self, $class);
}
71
# Hook for the start of processing.  The base class implementation only
# unpacks its arguments ($pluginfo, $base_dir, $processor, $maxdocs) and
# does no work.
sub begin {
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
76
# Hook for the end of processing.  The base class implementation does
# no work.
sub end {
    my ($self) = @_;
}
80
# Abstract method: a sub class returns 1 if it might recurse using
# $pluginfo.  The base class version always dies.
sub is_recursive {
    my ($self) = @_;

    die "BasPlug::is_recursive function must be implemented in sub classes\n";
}
87
# Abstract method: a sub class returns the number of files processed, or
# undef if it can't process $file.  Note that $base_dir might be "" and
# that $file might include directories.  The base class version always
# dies.
sub read {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    die "BasPlug::read function must be implemented in sub classes\n";
}
99
# Read the entire file named by $filename into $$textref.
#
# The encoding comes from $self->{'input_encoding'}: the aliases Latin1
# and Arabic are mapped to iso_8859_1 and windows_1256, any other value is
# used unchanged.  "ascii" input is slurped as-is with no conversion; every
# other encoding is read through the multiread package (input may be in any
# encoding multiread handles, output text will be in utf8), and "gb" text
# is additionally passed through cnseg::segment.
#
# Dies if $filename cannot be opened for reading.
sub read_file {
    my $self = shift (@_);
    my ($filename, $textref) = @_;

    $$textref = "";

    my $encoding = $self->{'input_encoding'};
    if ($encoding =~ /^(Latin1|iso_8859_1)$/) {
        $encoding = "iso_8859_1";
    } elsif ($encoding =~ /^(Arabic|windows_1256)$/) {
        $encoding = "windows_1256";
    }

    # Three-argument open takes the filename literally.  The two-argument
    # form strips surrounding whitespace and treats leading/trailing
    # characters such as ">" or "|" as an open mode, so oddly named source
    # documents could be misopened (or run as a command).
    open (FILE, '<', $filename)
        or die "BasPlug::read_file could not open $filename for reading ($!)\n";

    if ($encoding eq "ascii") {
        # Slurp mode is localised so the caller's record separator is put
        # back on exit, whatever it was, even if the read dies.
        local $/ = undef;
        $$textref = <FILE>;
    } else {
        # multiread is handed the handle by its package-qualified name
        my $reader = multiread->new();
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);

        if ($encoding eq "gb") {
            # segment the Chinese words
            $$textref = cnseg::segment ($$textref);
        }
    }

    close FILE;
}
138
# Attach to $doc_obj, at section $cursection, any extra metadata that has
# been passed around from one plugin to another.
#
# $metadata is a hash reference keyed by field name; each value is either
# a single value or a reference to an array of values, in which case every
# element is added under the same field.  Values are handed to
# add_utf8_metadata, so they are expected to be in utf8 already.
sub extra_metadata {
    my ($self, $doc_obj, $cursection, $metadata) = @_;

    for my $name (keys %$metadata) {
        # plain (non-array) value: add it directly
        if (ref ($metadata->{$name}) ne "ARRAY") {
            $doc_obj->add_utf8_metadata ($cursection, $name, $metadata->{$name});
            next;
        }

        # array reference: add each element in order
        for my $value (@{$metadata->{$name}}) {
            $doc_obj->add_utf8_metadata ($cursection, $name, $value);
        }
    }
}
158
1591;
Note: See TracBrowser for help on using the repository browser.