source: gsdl/trunk/perllib/plugins/ProCitePlugin.pm@ 15865

Last change on this file since 15865 was 15865, checked in by kjdon, 16 years ago

renaming plugins in preparation for my plugin overhaul

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
###########################################################################
#
# ProCitePlug.pm -- A plugin for (exported) ProCite databases
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright 1999-2004 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
26
27package ProCitePlug;
28
29
30use multiread;
31use SplitPlug;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Declare the inheritance chain at compile time: ProCitePlug derives from
# SplitPlug, so any method not defined here is looked up there.
BEGIN {
    @ProCitePlug::ISA = qw(SplitPlug);
}
40
41
# Description of the "process_exp" argument; its default comes from
# get_default_process_exp().
my %process_exp_argument = (
    'name' => "process_exp",
    'desc' => "{BasPlug.process_exp}",
    'type' => "regexp",
    'reqd' => "no",
    'deft' => get_default_process_exp(),
);

# Description of the "split_exp" argument; its default comes from
# get_default_split_exp().
my %split_exp_argument = (
    'name' => "split_exp",
    'desc' => "{SplitPlug.split_exp}",
    'type' => "regexp",
    'deft' => get_default_split_exp(),
    'reqd' => "no",
);

# Argument list that new() appends to the caller's "ArgList".
my $arguments = [ \%process_exp_argument, \%split_exp_argument ];

# Plugin option record that new() appends to the caller's "OptList".
my $options = {
    'name'     => "ProCitePlug",
    'desc'     => "{ProCitePlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
61
62
# Default "process_exp": a case-insensitive pattern matching file names
# that end in ".txt" (the suffix used for exported ProCite files).
# Returns the pattern as a plain string, not a compiled regex.
sub get_default_process_exp
{
    my $txt_suffix_pattern = '(?i)(\.txt)$';
    return $txt_suffix_pattern;
}
68
69
# Default "split_exp": the two-character pattern text \n (backslash, n),
# i.e. a regex that splits the input at every newline so that each line
# is treated as one record.
sub get_default_split_exp
{
    my $newline_pattern = '\n';
    return $newline_pattern;
}
75
76
# Constructor.
#
# Parameters:
#   $class           - the class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to SplitPlug's constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by SplitPlug's constructor, re-blessed into
# $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    # Use an explicit class-method call. The indirect-object form
    # ("new SplitPlug(...)") is ambiguous to the parser, particularly
    # inside a package that defines its own new().
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
90
91
# Maps the single-letter workform indicators "A" to "T" onto workform
# names. process() uses this to translate a record's indicator before
# looking up the workform definition.
my %crazy_workform_mapping = (
    A => "Book, Long Form",
    B => "Book, Short Form",
    C => "Journal, Long Form",
    D => "Journal, Short Form",
    E => "Report",
    F => "Newspaper",
    G => "Dissertation",
    H => "Trade Catalog",
    I => "Letter (Correspondence)",
    J => "Manuscript",
    K => "Conference Proceedings",
    L => "Map",
    M => "Music Score",
    N => "Sound Recording",
    O => "Motion Picture",
    P => "Audiovisual Material",
    Q => "Video Recording",
    R => "Art Work",
    S => "Computer Program",
    T => "Data File",
);
113
114
# Read an exported ProCite file and peel off its workform definitions.
#
# Parameters:
#   $filename - path of the file to read
#   $encoding - handed to the multiread reader via set_encoding()
#   $language - accepted but not used by this method
#   $textref  - scalar ref that receives the file contents; the leading
#               "<Workform Definition>" lines are removed from it so they
#               are not later treated as records
#
# Side effect: $self->{'workform_definitions'}->{$filename} is set to a
# hash ref mapping each workform name to an array ref of its field names.
# An (empty) entry is stored even when the file cannot be opened, so that
# process() always finds a hash to look in.
sub read_file
{
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;

    $self->{'workform_definitions'}->{$filename} = {};

    # The handle is given to multiread by its package-qualified name, so it
    # stays a package filehandle. Three-argument open keeps characters in
    # the file name from being interpreted as an open mode.
    if (!open(PROCITE_FILE, '<', $filename)) {
        print STDERR "ProCitePlug: unable to open $filename: $!\n";
        return;
    }

    # Read the contents of the file into $textref
    my $reader = multiread->new();
    $reader->set_handle('ProCitePlug::PROCITE_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(PROCITE_FILE);

    # Remember the workform definitions for when we're reading the records
    $self->{'workform_definitions'}->{$filename} =
        _parse_workform_definitions($textref);
}


# Remove the "<Workform Definition>" lines at the start of $$textref and
# return a hash ref of { workform name => [ field names ] }.
sub _parse_workform_definitions
{
    my ($textref) = @_;
    my %workform_definitions = ();

    # "." never matches a newline, so with the newline optional this
    # substitution succeeds whenever the text starts with the marker, even
    # on a final line that has no line terminator. Looping on the
    # substitution itself guarantees progress on every iteration.
    while ($$textref =~ s/^<Workform Definition>(.*)\n?//) {
        my $workform_definition = $1;

        # The first quoted value is the workform name; skip the line if it
        # is missing rather than reusing a stale capture.
        next unless ($workform_definition =~ s/^"([^"]*)",//);
        my $workform_name = $1;

        # The remaining quoted values are the field names, in order. Stop
        # at the first thing that is not a quoted value instead of spinning
        # forever on malformed input.
        my @workform_values;
        while ($workform_definition =~ s/^"([^"]*)",?//) {
            push(@workform_values, $1);
        }

        $workform_definitions{$workform_name} = \@workform_values;
    }

    return \%workform_definitions;
}
152
153
# Turn one ProCite record (one chunk of the split file) into document
# text and metadata.
#
# Parameters:
#   $textref   - scalar ref holding the record: quoted, comma-separated
#                values, starting with the workform indicator and the
#                record number. It is consumed (modified) while parsing.
#   $pluginfo  - not used by this method
#   $base_dir, $file - joined with util::filename_cat() to form the key
#                under which read_file() stored the workform definitions
#   $metadata  - not used by this method
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - when true, a "<Processing ...>" line is printed to STDERR
#
# Returns 1 when the record was processed, 0 when its header could not be
# parsed or its workform is unknown.
sub process
{
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $filename = &util::filename_cat($base_dir, $file);
    my $cursection = $doc_obj->get_top_section();

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ProCitePlug'>\n" if ($gli);
    print $outhandle "ProCitePlug: processing $file\n"
        if ($self->{'verbosity'} > 1);

    # Read the record's workform indicator and record number. Only trust
    # the captures when the substitution actually matched.
    unless ($$textref =~ s/^"([^"]*)","([^"]*)",//) {
        print STDERR "ProCitePlug: malformed record in $file\n";
        return 0;
    }
    my ($workform_indicator, $recordnum) = ($1, $2);

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record. Fall back to an
    # empty hash so a missing entry for this file is reported as an
    # unknown workform instead of dying on an undefined reference.
    my $definitions_for_file = $self->{'workform_definitions'}->{$filename} || {};
    my $field_names = $definitions_for_file->{$workform_indicator};
    if (!$field_names) {
        print STDERR "Unknown workform $workform_indicator!\n";
        return 0;
    }

    # Store the full record (minus the header just removed) as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "pc.Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "pc.RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    # Build up an HTML view of the record for easy display at run-time.
    # NOTE(review): values are inserted into the HTML without escaping;
    # confirm whether the display layer expects raw markup here.
    my $html_record = "<table>";
    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    # Read each field (surrounded by quotes) of the record. Looping on the
    # substitution itself means the loop ends as soon as the remaining
    # text is not a quoted field, instead of spinning on malformed input.
    my $fieldnum = 0;
    while ($$textref =~ s/^"([^"]*)",?//) {
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # A record may carry more fields than its workform defines;
            # such fields have no display name.
            my $field_name = $field_names->[$fieldnum];
            my $has_field_name = defined($field_name);
            $field_name = "" unless ($has_field_name);

            # Add the display name of the metadata field for format statement convenience
            if ($has_field_name && $field_name ne "---") {
                my $name_key = "pc.Field" . ($fieldnum + 1) . "Name";
                $doc_obj->add_utf8_metadata($cursection, $name_key, $field_name);
            }

            $html_record .= "<tr><td valign=top><b>$field_name: </b></td><td valign=top>";

            # Multiple metadata values are separated with "//"
            my $value_key = "pc.Field" . ($fieldnum + 1) . "Value";
            foreach my $field_value (split(/\/\//, $field_value_raw)) {
                $doc_obj->add_utf8_metadata($cursection, $value_key, $field_value);
                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    # Anything other than whitespace left over could not be parsed as a field
    if ($$textref =~ /\S/) {
        print STDERR "ProCitePlug: ignoring unparsable data at end of record $recordnum in $file\n";
    }

    $html_record .= "</table>";

    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
239
240
2411;
Note: See TracBrowser for help on using the repository browser.