########################################################################### # # ProCitePlugin.pm -- A plugin for (exported) ProCite databases # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright 1999-2004 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package ProCitePlugin;

use multiread;
use SplitTextFile;
use MetadataRead;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# NOTE(review): the copy of this file under review had lost every piece of
# angle-bracket markup (the workform-definition line marker used in
# read_file() and the HTML tags used in process()).  They are restored below
# as "<Workform Definition>" and a simple <table> layout -- please confirm
# both against the upstream Greenstone source before merging.

# ProCitePlugin is a sub-class of SplitTextFile
sub BEGIN {
    @ProCitePlugin::ISA = ('MetadataRead', 'SplitTextFile');
}

# Plugin arguments.  The default-returning subs below are already compiled by
# the time this initialiser runs, so they can be called before their
# definitions appear in the file.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp() },
      { 'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        'deft' => get_default_split_exp(),
        'reqd' => "no" },

      # The interesting options
      { 'name' => "entry_separator",
        'desc' => "{ProCitePlugin.entry_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "//" },
    ];

my $options = { 'name'     => "ProCitePlugin",
                'desc'     => "{ProCitePlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'explodes' => "yes",
                'args'     => $arguments };


# This plugin processes exported ProCite files with the suffix ".txt"
sub get_default_process_exp
{
    return q^(?i)(\.txt)$^;
}


# This plugin splits the input text at every line
sub get_default_split_exp
{
    return q^\n^;
}


# Constructor.  Registers this plugin's argument and option lists in the
# shared $hashArgOptLists, builds the object through SplitTextFile and
# re-blesses it into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}


# Single-letter workform indicators and the workform names they stand for
my %crazy_workform_mapping =
    ( "A" => "Book, Long Form",
      "B" => "Book, Short Form",
      "C" => "Journal, Long Form",
      "D" => "Journal, Short Form",
      "E" => "Report",
      "F" => "Newspaper",
      "G" => "Dissertation",
      "H" => "Trade Catalog",
      "I" => "Letter (Correspondence)",
      "J" => "Manuscript",
      "K" => "Conference Proceedings",
      "L" => "Map",
      "M" => "Music Score",
      "N" => "Sound Recording",
      "O" => "Motion Picture",
      "P" => "Audiovisual Material",
      "Q" => "Video Recording",
      "R" => "Art Work",
      "S" => "Computer Program",
      "T" => "Data File"
    );


# Read $filename (decoded with $encoding via multiread) into $$textref, then
# strip the workform definition lines from the start of the text and remember
# them in $self->{'workform_definitions'}->{$filename} for process().
#
#   $filename  path of the exported ProCite file
#   $encoding  encoding name handed to multiread
#   $language  accepted for interface compatibility; not used here
#   $textref   reference to the scalar that receives the file contents
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Store the workform definitions for this file
    my %workform_definitions = ();

    # Read the contents of the file into $textref.  The handle stays a package
    # bareword because multiread is given the handle by name.
    if (open(PROCITE_FILE, "<", $filename)) {
        my $reader = multiread->new();
        $reader->set_handle ('ProCitePlugin::PROCITE_FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
        close(PROCITE_FILE);
    }
    else {
        # Report the failure instead of silently reading from a closed handle
        print STDERR "ProCitePlugin: unable to open $filename: $!\n";
    }

    # Read the workform definitions at the start of the file.  Testing and
    # removing the line in one substitution guarantees the loop makes progress,
    # even when the last definition line has no trailing newline.
    while (defined $$textref
           && $$textref =~ s/^\<Workform Definition\>(.*)(?:\n|\z)//) {
        my $workform_definition = $1;

        # The first quoted value is the workform name
        unless ($workform_definition =~ s/^\"([^\"]*)\",//) {
            print STDERR "ProCitePlugin: ignoring malformed workform definition in $filename\n";
            next;
        }
        my $workform_name = $1;

        # The remaining quoted values are the field names, in field order
        my @workform_values;
        while ($workform_definition !~ /^\s*$/) {
            unless ($workform_definition =~ s/^\"([^\"]*)\",?//) {
                # Unparseable remainder: stop here rather than loop forever
                print STDERR "ProCitePlugin: malformed field list for workform $workform_name in $filename\n";
                last;
            }
            push(@workform_values, $1);
        }

        # Remember this workform definition for when we're reading the records
        $workform_definitions{$workform_name} = \@workform_values;
    }

    $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
}


# Turn one exported record (one line of the file, in $$textref) into metadata
# on $doc_obj: Workform, RecordNumber, FileFormat, one value per non-empty
# field entry, and an HTMLDisplay table summarising the record.
#
# Returns 1 when the record was processed, 0 when it could not be parsed or
# its workform has no stored definition.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $filename = util::filename_cat($base_dir, $file);
    my $cursection = $doc_obj->get_top_section();

    # Build up an HTML view of the record for easy display at run-time
    my $html_record = "";

    # Read the record's workform indicator.  Only trust $1 when the
    # substitution actually matched; otherwise it holds a stale capture.
    unless ($$textref =~ s/^\"([^\"]*)\",//) {
        print STDERR "ProCitePlugin: record does not start with a workform indicator\n";
        return 0;
    }
    my $workform_indicator = $1;

    # Some procite files have a record number next.  s/// returns a success
    # flag, not the capture, so the number has to be taken from $1.
    my $recordnum = "undefined";
    if ($$textref =~ s/^\"(\d*)\",// && $1 ne "") {
        $recordnum = $1;
    }

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record.  Falling back to an
    # empty hash avoids a fatal dereference when read_file() stored nothing
    # for this path, and using the reference avoids copying the hash per record.
    my $known_workforms = $self->{'workform_definitions'}->{$filename} || {};
    my $workform_fields = $known_workforms->{$workform_indicator};
    if (!$workform_fields) {
        print STDERR "Unknown workform $workform_indicator!\n";
        return 0;
    }

    # Store the full record as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    $html_record .= "<table>";
    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    # Read each field (surrounded by quotes) of the record
    my $fieldnum = 0;
    while ($$textref !~ /^\s*$/) {
        unless ($$textref =~ s/^\"([^\"]*)\",?//) {
            # Unparseable remainder: stop here rather than loop forever
            print STDERR "ProCitePlugin: ignoring malformed trailing data in record $recordnum\n";
            last;
        }
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # Use the display name of the field for format statement
            # convenience; unnamed ("---") fields, and fields beyond the end
            # of the workform definition, get a positional name instead.
            my $field_name = $workform_fields->[$fieldnum];
            if (!defined $field_name || $field_name eq "---") {
                $field_name = "Field" . ($fieldnum + 1);
            }

            $html_record .= "<tr><td valign=top><b>$field_name: </b></td><td valign=top>";

            # Multiple metadata values are separated with the entry separator
            # ("//" by default)
            foreach my $field_value (split($self->{'entry_separator'}, $field_value_raw)) {
                $doc_obj->add_utf8_metadata($cursection, $field_name, $field_value);
                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    $html_record .= "</table>";

    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}


1;