source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 6138

Last change on this file since 6138 was 6138, checked in by mdewsnip, 20 years ago

Added plugin type metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33
34# ISISPlug is a sub-class of SplitPlug.
# Declare the parent class at compile time: any method that ISISPlug does
# not define itself is looked up in SplitPlug.
BEGIN {
    @ISA = qw(SplitPlug);
}
38
39
# Descriptions of the options this plugin understands. Each entry gives the
# option name, a description key, its type, whether it is required, and its
# default value. The 'desc' values are brace-wrapped keys rather than prose;
# presumably they are resolved against a string resource elsewhere (confirm).
#
# Every entry carries an explicit 'reqd' value so that code consuming the
# list never sees an undefined "required" flag for one option only.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "string",
        'reqd' => "no",
        'deft' => get_default_process_exp() },
      { 'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "string",
        'reqd' => "no",
        'deft' => get_default_block_exp() },
      { 'name' => "subfield_separator",
        'desc' => "{ISISPlug.subfield_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => ", " },
      { 'name' => "entry_separator",
        'desc' => "{ISISPlug.entry_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "<br>" } ];

# The option record for this plugin; new() appends it to the object's
# option list so that it sits alongside the records of the parent classes.
my $options = { 'name'     => "ISISPlug",
                'desc'     => "{ISISPlug.desc}",
                'inherits' => "Yes",
                'args'     => $arguments };
65
66
67# This plugin processes files with the suffix ".mst"
sub get_default_process_exp {
    # Case-insensitive match on a trailing ".mst" suffix. The pattern is
    # returned as a plain string, not as a compiled regular expression.
    my $mst_suffix_pattern = q{(?i)(\.mst)$};
    return $mst_suffix_pattern;
}
71
72
73# This plugin blocks files with the suffix ".fdt" and ".xrf"
sub get_default_block_exp {
    # Case-insensitive match on a trailing ".fdt" or ".xrf" suffix. The
    # pattern is returned as a plain string, not as a compiled regular
    # expression.
    my $companion_suffix_pattern = q{(?i)(\.fdt|\.xrf)$};
    return $companion_suffix_pattern;
}
77
78
79# This plugin splits the input text at the "----------" lines
sub get_default_split_exp {
    # A line consisting of exactly ten dashes, with its surrounding newlines.
    # The "\n" sequences are kept as literal backslash-n text in the returned
    # string; they only become newlines once the string is used as a pattern.
    my $record_separator_pattern = q{\n----------\n};
    return $record_separator_pattern;
}
83
84
# Constructor. Builds the object through the SplitPlug constructor, reads the
# two ISISPlug-specific options (subfield_separator, entry_separator) from the
# remaining arguments, registers this plugin's option record, and returns the
# object blessed into $class.
#
# Dies (after printing a message to STDERR) when the option parser rejects
# the arguments.
sub new {
    my $class = shift(@_);

    my $self = SplitPlug->new($class, @_);

    # Unknown options are tolerated ("allow_extra_options"); they may belong
    # to one of the parent classes.
    my $options_accepted = parsargv::parse(
        \@_,
        q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
        q^entry_separator/.*/<br>^,  \$self->{'entry_separator'},
        "allow_extra_options");

    if (!$options_accepted) {
        print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
        die "\n";
    }

    # Append this plugin's option record to the list held by the object, so
    # that the arguments of the whole inheritance chain are kept together.
    my $inherited_options = $self->{'option_list'};
    push(@{$inherited_options}, $options);

    $self->{'plugin_type'} = "ISISPlug";

    return bless $self, $class;
}
104
105
# Read a CDS/ISIS master file into $$textref.
#
# Arguments:
#   $filename - path of the ".mst" file
#   $encoding - encoding handed to the multiread reader
#   $language - accepted for interface compatibility; not used here
#   $textref  - reference to the scalar that receives the text
#
# The records are not read from the .mst file directly: the external IsisGdl
# program is run on it and its output is captured. The numeric tags in that
# output are then replaced by the field titles found in the database's Field
# Definition Table (the ".fdt" file next to the ".mst" file).
#
# Dies when $filename does not end in ".mst", when IsisGdl cannot be started,
# or (inside parse_field_definition_table) when the .fdt file cannot be opened.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # The database base name is the full path minus the ".mst" suffix.
    # Matching from the start of the string keeps directory names that
    # contain dots (e.g. "gsdl-2.40/...") intact; excluding dots from the
    # captured part would cut the path off at the last such directory.
    my ($databasename) = ($filename =~ /^(.+)\.mst$/is);
    die "Error: $filename is not a CDS/ISIS master (.mst) file.\n"
        unless defined($databasename);

    # The text to split is exported from the database by the IsisGdl program.
    # The filename is quoted so that paths containing spaces reach IsisGdl as
    # a single argument.
    open(FILE, "IsisGdl \"$filename\" |")
        or die "Error: Could not run IsisGdl on $filename: $!\n";

    my $reader = multiread->new();
    $reader->set_handle ('ISISPlug::FILE');
    $reader->set_encoding ($encoding);
    $reader->read_file ($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my $fdtfilename = $databasename . ".fdt";
    my %fdtmapping = parse_field_definition_table($fdtfilename);

    # Map the tag numbers to tag names, using the FDT mapping. A tag number
    # with no (or an empty) title in the FDT is left as it is, rather than
    # being replaced by an empty name that could no longer be parsed.
    $$textref =~ s{\ntag=(\d+) }{
        my $tagnumber = $1;
        my $tagtitle = exists($fdtmapping{$tagnumber})
            ? $fdtmapping{$tagnumber}{'title'}
            : "";
        $tagtitle = $tagnumber unless (defined($tagtitle) && $tagtitle ne "");
        "\ntag=$tagtitle ";
    }ge;

    # Add a newline at the start so it is split properly
    $$textref = "\n" . $$textref;
}
132
133
# Turn the text of one ISIS record into metadata and text on $doc_obj.
#
# Arguments:
#   $textref  - reference to the record text, one "tag=NAME data=VALUE" line
#               per field; on return "<" and ">" in it have been escaped
#   $pluginfo, $base_dir, $metadata - accepted for interface compatibility;
#               not used here
#   $file     - name reported in the progress message
#   $doc_obj  - document object that receives the metadata and the text
#
# For every field line:
#   * the metadata name is the tag name in title case with the spaces (and
#     any "&") removed;
#   * the data is split on "%" into entries, and each entry into subfields
#     introduced by "^x" (x a lower-case letter); a subfield value is stored
#     under "Name.x", a value without a subfield under "Name";
#   * the values of a "Keywords" field are split into their "<...>" parts,
#     each stored separately;
#   * "Name.all" holds everything joined together, subfields separated by
#     the subfield_separator option and entries by the entry_separator option.
#
# Lines that are not of the "tag=... data=..." form are ignored.
#
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Everything is attached to the top-level section of the document
    my $cursection = $doc_obj->get_top_section();

    # Report that we're processing the file
    print $outhandle "IsisPlug: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Skip anything that is not a field line; after a failed match the
        # capture variables would still hold values from an earlier match.
        # The tag part is matched non-greedily so that a " data=" inside the
        # value cannot be mistaken for the end of the tag name.
        next unless ($line =~ /^tag=(.+?) data=(.+)$/);
        my ($rawtagname, $rawtagdata) = ($1, $2);

        # Metadata field names: title case, then remove spaces
        my $tagname = "";
        foreach my $word (split(/\s+/, $rawtagname)) {
            next if ($word eq "");
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $tagname .= $word;
        }

        # Make sure there is nothing bad in the tag names
        $tagname =~ s/&//g;

        # Handle each piece of metadata ('%' separated)
        my $completetagvalue = "";
        foreach my $rawtagvalue (split(/%/, $rawtagdata)) {
            $completetagvalue .= $entry_separator unless ($completetagvalue eq "");

            # Metadata field values: take care with subfields
            my $completeentryvalue = "";
            while ($rawtagvalue ne "") {
                # If there is a subfield specifier, parse it off
                my $subfieldname = "";
                if ($rawtagvalue =~ s/^\^([a-z])//) {
                    $subfieldname = "." . $1;
                }
                elsif ($rawtagvalue =~ s/^\^//) {
                    # A caret that does not introduce a lower-case subfield
                    # letter. It is dropped here: the value pattern below
                    # stops at a caret, so leaving it in place would mean
                    # nothing is ever consumed and the loop never ends.
                }

                # Parse the metadata value off (everything up to the next caret)
                $rawtagvalue =~ s/^([^\^]*)//;
                my $metadatafieldvalue = $1;
                my $metadatafieldname = $tagname . $subfieldname;

                # Handle Keywords specially: every "<...>" part is a keyword
                if ($metadatafieldname eq "Keywords") {
                    my $keywordmetadatavalue = $metadatafieldvalue;
                    my $keywordlist = "";
                    while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
                        my $keyword = $1;
                        $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
                        $keywordlist .= ", " unless ($keywordlist eq "");
                        $keywordlist .= $keyword;
                    }

                    $metadatafieldvalue = $keywordlist;
                }
                else {
                    $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
                }

                $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
                $completeentryvalue .= $metadatafieldvalue;
            }

            $completetagvalue .= $completeentryvalue;
        }

        $doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue);
    }

    # Add the full record as the document text
    $$textref =~ s/\</&lt;/g;
    $$textref =~ s/\>/&gt;/g;
    $doc_obj->add_utf8_text ($cursection, $$textref);

    # Document was processed successfully
    return 1;
}
222
223
# Parse a CDS/ISIS Field Definition Table (.fdt) file.
#
# Argument:
#   $fdtfilename - path of the .fdt file
#
# Everything up to and including the line "***" is skipped. Each later line
# is read as fixed-width columns: the field title in columns 1-30, the
# subfield letters in columns 31-50, and from column 51 a space-separated
# specification whose first item is the tag number.
#
# Returns a hash (as a list) that maps each tag number to a hash reference
# with the keys 'title' and 'subfields'. Lines that are too short to hold a
# specification are ignored.
#
# Dies when the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);

    my %fdtmapping = ();

    open(my $fdt_handle, '<', $fdtfilename)
        or die "Error: Could not open file $fdtfilename.\n";

    my $amongstdefinitions = 0;
    while (defined(my $fdtfileline = <$fdt_handle>)) {
        # Remove line endings and any nasty spaces at the end of the lines
        $fdtfileline =~ s/\s+$//;

        # Nothing before the "***" marker line is a field definition
        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # Blank or truncated lines have no specification column and hence
        # no tag number; storing them would create a bogus "" entry.
        next if (length($fdtfileline) <= 50);

        my $fieldtitle = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove the padding of the fixed-width columns
        $fieldtitle =~ s/\s+$//;
        $fieldsubfields =~ s/\s+$//;

        # The tag number is the first item of the specification; splitting
        # on ' ' also copes with leading or repeated spaces.
        my ($fieldtag) = split(' ', $fieldspecs);
        next unless (defined($fieldtag) && $fieldtag ne "");

        # Map from tag number to metadata field title and subfields
        $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
                                   'subfields' => $fieldsubfields };
    }

    close($fdt_handle);

    return %fdtmapping;
}
259
260
2611;
Note: See TracBrowser for help on using the repository browser.