source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 6812

Last change on this file since 6812 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the category regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 7.6 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33
# Establish the inheritance chain at compile time: ISISPlug derives
# from SplitPlug.
BEGIN {
    @ISA = qw(SplitPlug);
}
38
39
# Descriptions of the options this plugin accepts.  Each entry is a hash
# with the option name, a description key in "{...}" form, a type, a
# default value and (for most entries) a 'reqd' flag.  The regexp
# defaults are taken from the get_default_*_exp subs of this package.
#
# NOTE(review): the block_exp entry carries no 'reqd' key, unlike every
# other entry here -- left unchanged; confirm whether that is intended.
my $arguments = [
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'deft' => '<br>',
        'reqd' => 'no',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'deft' => ', ',
        'reqd' => 'no',
    },
    {
        'name' => 'split_exp',
        'desc' => '{SplitPlug.split_exp}',
        'type' => 'regexp',
        'deft' => get_default_split_exp(),
        'reqd' => 'no',
    },
];
66
# Top-level option record for this plugin: its name, description key,
# the abstract/inherits flags, and the argument list declared in
# $arguments.  The constructor pushes this onto the object's option list.
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'args'     => $arguments,
    'abstract' => 'no',
    'inherits' => 'yes',
};
72
73
# Default process expression: a case-insensitive match on file names
# ending in ".mst".  Returned as pattern text, not a compiled regex.
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
78
79
# Default block expression: a case-insensitive match on file names
# ending in ".fdt" or ".xrf".  Returned as pattern text.
sub get_default_block_exp {
    my $blocked_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $blocked_suffix_exp;
}
84
85
# Default split expression: a line consisting of ten dashes, with the
# newline on either side.  The returned text contains the two-character
# sequence backslash-n (pattern source), not real newline characters.
sub get_default_split_exp {
    my $record_divider_exp = '\n----------\n';
    return $record_divider_exp;
}
90
91
# Constructor.  Builds the object through SplitPlug's constructor, then
# parses this plugin's own two options (subfield_separator and
# entry_separator) out of the remaining arguments, tolerating options it
# does not know about.  Prints a message to STDERR and dies if the
# options cannot be parsed.  Returns the object blessed into $class.
sub new {
    my $class = shift;

    my $self = SplitPlug->new($class, @_);

    my $parsed_ok = parsargv::parse(\@_,
        q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
        q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
        "allow_extra_options");
    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
        die "\n";
    }

    # Register this plugin's option record so that arguments are
    # inherited properly
    my $inherited_options = $self->{'option_list'};
    push(@{$inherited_options}, $options);
    $self->{'plugin_type'} = "ISISPlug";

    return bless($self, $class);
}
111
112
# Export the records of an ISIS master file (.mst) as text into $$textref.
#
# Parameters:
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader
#   $language - not used by this sub
#   $textref  - reference to the scalar that receives the exported text
#
# The text is produced by running the external IsisGdl program on the
# file.  Numeric tags ("tag=NN ") are then replaced by the field titles
# found in the database's .fdt file, and a newline is prepended so the
# first record is split like the others.
#
# Dies (inside parse_field_definition_table) if the .fdt file cannot be
# opened.  If IsisGdl cannot be started, a message is printed to STDERR
# and $$textref is left untouched.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Strip only the trailing ".mst".  The previous pattern ([^\.]+) also
    # discarded everything up to the last dot in the directory part, so
    # "./db.mst" or "/my.coll/import/db.mst" produced a wrong .fdt path.
    my ($databasename) = ($filename =~ /^(.*)\.mst$/si);

    # The text to split is exported from the database by the IsisGdl
    # program.  The file name is quoted so that paths containing spaces
    # reach IsisGdl as a single argument.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "ISISPlug: could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle ('ISISPlug::FILE');
    $reader->set_encoding ($encoding);
    $reader->read_file ($textref);

    # close() on a pipe reports a failed command; warn but carry on
    if (!close(FILE)) {
        print STDERR "ISISPlug: IsisGdl reported a problem reading $filename\n";
    }

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my $fdtfilename = $databasename . ".fdt";
    my %fdtmapping = parse_field_definition_table($fdtfilename);

    # Map the tag numbers to tag names, using the FDT mapping
    $$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;

    # Add a newline at the start so it is split properly
    $$textref = "\n" . $$textref;
}
139
140
# Convert the text of one exported ISIS record into metadata and
# document text on $doc_obj.
#
# Parameters:
#   $textref  - reference to the record text, one "tag=NAME data=VALUE"
#               line per field
#   $pluginfo, $base_dir, $metadata - not used by this sub
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object; must provide get_top_section,
#               add_utf8_metadata and add_utf8_text
#   $gli      - when true, a <Processing> line is written to STDERR
#
# For every field line:
#   * the metadata name is the tag name with each word's first letter
#     upper-cased, whitespace removed and '&' stripped;
#   * the data is split on '%' into entries and each entry into
#     "^x"-prefixed subfields (x a lower-case letter), stored as
#     NAME or NAME.x;
#   * for the field named Keywords, each <...> group is stored as its
#     own value;
#   * NAME.all receives all values, subfields joined with the
#     subfield_separator option and entries with entry_separator.
# Lines that are not of the "tag=... data=..." form are ignored.
# Finally the record text, with '<' and '>' escaped, becomes the
# document text.  Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # $cursection was previously never assigned, so undef was passed as
    # the section; ask the document for its top-level section instead.
    my $cursection = $doc_obj->get_top_section();

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Skip blank or malformed lines: without this check $1/$2 would
        # be undefined or left over from an unrelated earlier match
        next unless ($line =~ /^tag=(.+) data=(.+)$/);
        my $rawtagname = $1;
        my $rawtagdata = $2;

        # Metadata field names: title case, then remove spaces
        my $tagname = "";
        foreach my $word (split(/\s+/, $rawtagname)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $tagname .= $word;
        }

        # Make sure there is nothing bad in the tag names
        $tagname =~ s/&//g;

        # Handle each piece of metadata ('%' separated)
        my $completetagvalue = "";
        foreach my $rawtagvalue (split(/%/, $rawtagdata)) {
            $completetagvalue .= $entry_separator unless ($completetagvalue eq "");

            # Metadata field values: take care with subfields
            my $completeentryvalue = "";
            while ($rawtagvalue ne "") {
                # If there is a subfield specifier, parse it off
                my $subfieldname = "";
                if ($rawtagvalue =~ s/^\^([a-z])//) {
                    $subfieldname = "." . $1;
                }

                # Parse the metadata value off: everything up to the next
                # subfield specifier.  A '^' that is not followed by a
                # lower-case letter is kept as literal text; this makes
                # every pass consume at least one character, where the
                # old pattern looped forever on such input.
                $rawtagvalue =~ s/^((?:[^\^]|\^(?![a-z]))*)//;
                my $metadatafieldvalue = $1;
                my $metadatafieldname = $tagname . $subfieldname;

                # Handle Keywords specially
                if ($metadatafieldname eq "Keywords") {
                    my $keywordmetadatavalue = $metadatafieldvalue;
                    my $keywordlist = "";
                    while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
                        my $keyword = $1;
                        $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
                        $keywordlist .= ", " unless ($keywordlist eq "");
                        $keywordlist .= $keyword;
                    }

                    $metadatafieldvalue = $keywordlist;
                }
                else {
                    $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
                }

                $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
                $completeentryvalue .= $metadatafieldvalue;
            }

            $completetagvalue .= $completeentryvalue;
        }

        $doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue);
    }

    # Add the full record as the document text
    $$textref =~ s/\</&lt;/g;
    $$textref =~ s/\>/&gt;/g;
    $doc_obj->add_utf8_text ($cursection, $$textref);

    # Document was processed successfully
    return 1;
}
230
231
# Read an ISIS Field Definition Table (.fdt) file.
#
# Parameter:
#   $fdtfilename - path of the .fdt file
#
# Returns a hash (as a flat list) keyed by field tag; each value is a
# hash reference with
#   'title'     - the field title (columns 0-29, trailing space removed)
#   'subfields' - the subfield letters (columns 30-49, trailing space
#                 removed)
# The tag is the first space-separated item starting at column 50.
# Only lines after the "***" marker line are treated as definitions.
# Lines too short to reach column 50, or with no tag there, are skipped
# rather than being stored under an empty key.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);

    my %fdtmapping = ();

    # Three-argument open on a lexical handle: the file name is never
    # interpreted as an open mode and no global handle is clobbered
    open(my $fdt_handle, '<', $fdtfilename)
        || die "Error: Could not open file $fdtfilename.\n";

    my $amongstdefinitions = 0;
    while (defined(my $fdtfileline = <$fdt_handle>)) {
        # Remove any spaces (and the line ending) at the end of the line
        $fdtfileline =~ s/\s+$//;

        if (!$amongstdefinitions) {
            # Everything up to and including the "***" line is header
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # A definition must extend past the title and subfield columns
        next if (length($fdtfileline) <= 50);

        my $fieldtitle = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldtitle =~ s/\s+$//;
        $fieldsubfields =~ s/\s+$//;

        # Map from tag number to metadata field title and subfields
        my $fieldtag = (split(/ /, $fieldspecs))[0];
        next unless (defined($fieldtag) && $fieldtag ne "");
        $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
                                   'subfields' => $fieldsubfields };
    }

    close($fdt_handle);

    return %fdtmapping;
}
267
268
2691;
Note: See TracBrowser for help on using the repository browser.