source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 7703

Last change on this file since 7703 was 7693, checked in by mdewsnip, 20 years ago

Improvements to the new code so that it works on Windows as well as Unix.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.8 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33
34# ISISPlug is a sub-class of SplitPlug.
35sub BEGIN {
36 @ISA = ('SplitPlug');
37}
38
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'reqd' => "no",
45 'deft' => &get_default_process_exp() },
46 { 'name' => "block_exp",
47 'desc' => "{BasPlug.block_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_block_exp() },
51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() },
56
57 # The interesting options
58 { 'name' => "entry_separator",
59 'desc' => "{ISISPlug.entry_separator}",
60 'type' => "string",
61 'reqd' => "no",
62 'deft' => "<br>" },
63 { 'name' => "subfield_separator",
64 'desc' => "{ISISPlug.subfield_separator}",
65 'type' => "string",
66 'reqd' => "no",
67 'deft' => ", " },
68 { 'name' => "document_field",
69 'desc' => "{ISISPlug.document_field}",
70 'type' => "string",
71 'reqd' => "no",
72 'deft' => "" },
73 { 'name' => "document_prefix",
74 'desc' => "{ISISPlug.document_prefix}",
75 'type' => "string",
76 'reqd' => "no",
77 'deft' => "" },
78 { 'name' => "document_suffix",
79 'desc' => "{ISISPlug.document_suffix}",
80 'type' => "string",
81 'reqd' => "no",
82 'deft' => "" }
83 ];
84
85my $options = { 'name' => "ISISPlug",
86 'desc' => "{ISISPlug.desc}",
87 'abstract' => "no",
88 'inherits' => "yes",
89 'args' => $arguments };
90
91
92# This plugin processes files with the suffix ".mst"
93sub get_default_process_exp {
94 return q^(?i)(\.mst)$^;
95}
96
97
98# This plugin blocks files with the suffix ".fdt" and ".xrf"
99sub get_default_block_exp {
100 return q^(?i)(\.fdt|\.xrf)$^;
101}
102
103
104# This plugin splits the input text at the "----------" lines
105sub get_default_split_exp {
106 return q^\n----------\n^;
107}
108
109
110sub new {
111 my $class = shift(@_);
112
113 my $self = new SplitPlug($class, @_);
114 if (!parsargv::parse(\@_,
115 q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
116 q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
117 q^document_field/.*/^, \$self->{'document_field'},
118 q^document_prefix/.*/^, \$self->{'document_prefix'},
119 q^document_suffix/.*/^, \$self->{'document_suffix'},
120 "allow_extra_options")) {
121 print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
122 die "\n";
123 }
124
125 # To allow for proper inheritance of arguments
126 my $option_list = $self->{'option_list'};
127 push(@{$option_list}, $options);
128 $self->{'plugin_type'} = "ISISPlug";
129
130 return bless $self, $class;
131}
132
133
134sub read
135{
136 my $self = shift(@_);
137 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
138
139 my $result = &SplitPlug::read($self, @_);
140 if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}) {
141 &end_metadata_xml_file($self->{'documents_metadata_xml_file'});
142 }
143
144 return $result;
145}
146
147
148sub read_file {
149 my $self = shift (@_);
150 my ($filename, $encoding, $language, $textref) = @_;
151
152 my ($databasename) = ($filename =~ /([^\.]+)\.mst$/i);
153
154 # Check the associated .fdt and .xrf files exist
155 # These files must have a lowercase extension for the IsisGdl program to work
156 # Bailing out because of this is kind of crappy but it is only an issue on Unix
157 my $fdtfilename = $databasename . ".fdt";
158 if (! -e $fdtfilename) {
159 die "Error: Could not find ISIS FDT file $fdtfilename.\n";
160 }
161 my $xrffilename = $databasename . ".xrf";
162 if (! -e $xrffilename) {
163 die "Error: Could not find ISIS XRF file $xrffilename.\n";
164 }
165
166 # The text to split is exported from the database by the IsisGdl program
167 open(FILE, "IsisGdl \"$filename\" |");
168
169 my $reader = new multiread();
170 $reader->set_handle ('ISISPlug::FILE');
171 $reader->set_encoding ($encoding);
172 $reader->read_file ($textref);
173
174 close(FILE);
175
176 # Parse the associated ISIS database Field Definition Table file (.fdt)
177 my %fdtmapping = &parse_field_definition_table($fdtfilename);
178
179 # Map the tag numbers to tag names, using the FDT mapping
180 $$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
181
182 # Remove the line at the start so it is split and processed properly
183 $$textref =~ s/^----------\n//;
184
185 # Obtain the documents specified in the CDS/ISIS database, if requested
186 if ($self->{'document_field'}) {
187 # Create a directory to store the document files
188 $self->{'documents_directory'} = $databasename . ".all";
189 if (-e $self->{'documents_directory'}) {
190 &util::rm_r($self->{'documents_directory'});
191 }
192 &util::mk_dir($self->{'documents_directory'});
193
194 # ...and a metadata.xml file for the document metadata (extracted from the database)
195 $self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml");
196 if (-e $self->{'documents_metadata_xml_file'}) {
197 &util::rm($self->{'documents_metadata_xml_file'});
198 }
199 &begin_metadata_xml_file($self->{'documents_metadata_xml_file'});
200 }
201}
202
203
204sub process
205{
206 my $self = shift (@_);
207 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
208 my $outhandle = $self->{'outhandle'};
209
210 my $subfield_separator = $self->{'subfield_separator'};
211 my $entry_separator = $self->{'entry_separator'};
212
213 # Report that we're processing the file
214 print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
215 print $outhandle "IsisPlug: processing $file\n"
216 if ($self->{'verbosity'}) > 1;
217
218 # Process each line of the ISIS record, one at a time
219 foreach $line (split(/\n/, $$textref)) {
220 $line =~ /^tag=(.+) data=(.+)$/;
221 my $rawtagname = $1;
222 my $rawtagdata = $2;
223 # print "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
224
225 # Metadata field names: title case, then remove spaces
226 my $tagname = "";
227 foreach $word (split(/\s+/, $rawtagname)) {
228 substr($word, 0, 1) =~ tr/a-z/A-Z/;
229 $tagname .= $word;
230 }
231
232 # Make sure there is nothing bad in the tag names
233 $tagname =~ s/&//g;
234
235 # Handle each piece of metadata ('%' separated)
236 my $completetagvalue = "";
237 foreach $rawtagvalue (split(/%/, $rawtagdata)) {
238 $completetagvalue .= $entry_separator unless ($completetagvalue eq "");
239
240 # Metadata field values: take care with subfields
241 my $completeentryvalue = "";
242 while ($rawtagvalue ne "") {
243 # If there is a subfield specifier, parse it off
244 my $subfieldname = "";
245 if ($rawtagvalue =~ s/^\^([a-z])//) {
246 $subfieldname = "." . $1;
247 }
248
249 # Parse the metadata value off
250 $rawtagvalue =~ s/^([^\^]*)//;
251 my $metadatafieldname = $tagname . $subfieldname;
252 my $metadatafieldvalue = $1;
253 # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";
254
255 # Handle Keywords specially
256 if ($metadatafieldname eq "Keywords") {
257 my $keywordmetadatavalue = $metadatafieldvalue;
258 my $keywordlist = "";
259 while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
260 my $keyword = $1;
261 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
262 $keywordlist .= ", " unless ($keywordlist eq "");
263 $keywordlist .= $keyword;
264 }
265
266 $metadatafieldvalue = $keywordlist;
267 }
268
269 else {
270 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
271 }
272
273 $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
274 $completeentryvalue .= $metadatafieldvalue;
275 }
276
277 $completetagvalue .= $completeentryvalue;
278 }
279
280 # print "Metadata: $tagname.all -> $completetagvalue\n";
281 $doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue);
282 }
283 # print "\n";
284
285 # Add the full record as the document text
286 $$textref =~ s/\</&lt;/g;
287 $$textref =~ s/\>/&gt;/g;
288 $doc_obj->add_utf8_text ($cursection, $$textref);
289
290 # Obtain the documents specified in the CDS/ISIS database, if requested
291 if ($self->{'document_field'}) {
292 my $document_field = $self->{'document_field'};
293 my $document_prefix = $self->{'document_prefix'} || "";
294 my $document_suffix = $self->{'document_suffix'} || "";
295
296 my $documents_directory = $self->{'documents_directory'};
297 my $document_obtained = 0;
298
299 # Look at all the metadata assigned to this record
300 my $record_metadata = $doc_obj->get_all_metadata($cursection);
301 foreach my $pair (@$record_metadata) {
302 my ($field, $value) = (@$pair);
303
304 # Does this metadata element specify a document to obtain?
305 if ($field eq $document_field) {
306 my $document_file_full = $document_prefix . $value . $document_suffix;
307
308 my $document_file = &obtain_document($self, $document_file_full, $documents_directory);
309 if ($document_file) {
310 $document_obtained = 1;
311 &write_metadata_xml_file($self->{'documents_metadata_xml_file'},
312 $document_file, $record_metadata);
313 }
314 }
315 }
316
317 # If there was a document obtained for this record we don't want the record as well
318 if ($document_obtained) {
319 return 0;
320 }
321 }
322
323 # Record was processed successfully (and there was no document obtained)
324 return 1;
325}
326
327
328sub parse_field_definition_table
329{
330 my $fdtfilename = shift(@_);
331
332 my %fdtmapping = ();
333
334 open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
335
336 my $amongstdefinitions = 0;
337 foreach $fdtfileline (<FDT_FILE>) {
338 $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
339
340 if ($amongstdefinitions) {
341 my $fieldtitle = substr($fdtfileline, 0, 30);
342 my $fieldsubfields = substr($fdtfileline, 30, 20);
343 my $fieldspecs = substr($fdtfileline, 50);
344
345 # Remove extra spaces
346 $fieldtitle =~ s/(\s*)$//;
347 $fieldsubfields =~ s/(\s*)$//;
348
349 # Map from tag number to metadata field title and subfields
350 my $fieldtag = (split(/ /, $fieldspecs))[0];
351 $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
352 'subfields' => $fieldsubfields };
353 }
354 elsif ($fdtfileline eq "***") {
355 $amongstdefinitions = 1;
356 }
357 }
358
359 close(FDT_FILE);
360
361 return %fdtmapping;
362}
363
364
365sub obtain_document
366{
367 my $self = shift(@_);
368 my $document_file_full = shift(@_);
369 my $documents_directory = shift(@_);
370
371 my $outhandle = $self->{'outhandle'};
372 print $outhandle "Obtaining document file $document_file_full...\n"
373 if ($self->{'verbosity'} > 1);
374
375 my $document_file_name;
376 my $local_document_file;
377
378 # Document specified is on the web
379 if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
380 $document_file_full =~ /([^\/]+)$/;
381 $document_file_name = $1;
382 $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
383
384 my $wget_options = "--quiet";
385 $wget_options = "--verbose" if ($self->{'verbosity'} > 2);
386 $wget_options .= " --timestamping"; # Only re-download files if they're newer
387 `wget $wget_options $document_file_full --output-document $local_document_file`;
388 }
389 # Document specified is on the disk
390 else {
391 my $dir_sep = &util::get_os_dirsep();
392 $document_file_full =~ /(.+$dir_sep)?(.*)$/;
393 $document_file_name = $2;
394 $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
395
396 &util::cp($document_file_full, $documents_directory);
397 }
398
399 # Check the document was obtained successfully
400 if (!-e $local_document_file) {
401 print STDERR "WARNING: Could not obtain document file $document_file_full\n";
402 return undef;
403 }
404
405 return $document_file_name;
406}
407
408
409sub begin_metadata_xml_file
410{
411 my $metadata_xml_file = shift(@_);
412
413 open(METADATA_XML_FILE, ">$metadata_xml_file");
414 print METADATA_XML_FILE
415 "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
416 "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
417 "<DirectoryMetadata>\n";
418 close(METADATA_XML_FILE);
419}
420
421
422sub write_metadata_xml_file
423{
424 my $metadata_xml_file = shift(@_);
425 my $file_name = shift(@_);
426 my $record_metadata = shift(@_);
427
428 # Make $file_name XML-safe
429 $file_name =~ s/</&lt;/g;
430 $file_name =~ s/>/&gt;/g;
431
432 open(METADATA_XML_FILE, ">>$metadata_xml_file");
433
434 print METADATA_XML_FILE
435 "\n" .
436 " <FileSet>\n" .
437 " <FileName>$file_name</FileName>\n" .
438 " <Description>\n";
439
440 foreach my $pair (@$record_metadata) {
441 my ($field, $value) = (@$pair);
442
443 # We're only interested in metadata from the database
444 next if ($field eq "gsdlsourcefilename");
445 next if ($field eq "gsdldoctype");
446 next if ($field eq "Language");
447 next if ($field eq "Encoding");
448 next if ($field eq "Identifier");
449 next if ($field eq "Source");
450 next if ($field eq "SourceSegment");
451 next if ($field eq "Plugin");
452
453 # Make $value XML-safe
454 $value =~ s/</&lt;/g;
455 $value =~ s/>/&gt;/g;
456
457 print METADATA_XML_FILE " <Metadata name=\"$field\">$value</Metadata>\n";
458 }
459
460 print METADATA_XML_FILE
461 " </Description>\n" .
462 " </FileSet>\n";
463
464 close(METADATA_XML_FILE);
465}
466
467
468sub end_metadata_xml_file
469{
470 my $metadata_xml_file = shift(@_);
471
472 open(METADATA_XML_FILE, ">>$metadata_xml_file");
473 print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
474 close(METADATA_XML_FILE);
475}
476
477
4781;
Note: See TracBrowser for help on using the repository browser.