source: trunk/gsdl/perllib/plugins/ProCitePlug.pm@ 9494

Last change on this file since 9494 was 9494, checked in by mdewsnip, 19 years ago

A couple more fixes for the metadata_read stuff.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# ProCitePlug.pm -- A plugin for (exported) ProCite databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ProCitePlug;
28
29
30use multiread;
31use SplitPlug;
32
33
# ProCitePlug inherits from SplitPlug. The parent list is assigned inside a
# BEGIN block so that it is already in place at compile time.
BEGIN {
    @ISA = qw(SplitPlug);
}
38
39
# Arguments declared by this plugin. Each entry gives the option name, the
# resource key of its description, its type, whether it is required, and the
# default value supplied by this plugin.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "split_exp",
        'desc' => "{SplitPlug.split_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_split_exp(),
    },
];

# Option description for this plugin; the constructor appends it to the
# object's option list.
my $options = {
    'name'     => "ProCitePlug",
    'desc'     => "{ProCitePlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
58
59
# Default process_exp for this plugin: exported ProCite files, recognised by
# a ".txt" suffix (matched case-insensitively).
# Returns the pattern as a plain string.
sub get_default_process_exp
{
    my $txt_suffix_pattern = '(?i)(\.txt)$';
    return $txt_suffix_pattern;
}
65
66
# Default split_exp for this plugin: the input text is split at every
# newline, so each line of the file is one record.
# Returns the pattern as a plain string (a backslash followed by "n").
sub get_default_split_exp
{
    my $newline_pattern = '\n';
    return $newline_pattern;
}
72
73
# Constructor. Builds the object through the SplitPlug constructor and then
# registers this plugin's option description on it.
#
# Arguments: the class name, followed by the plugin options; the options are
#            handed unchanged to the SplitPlug constructor.
# Returns:   the object, blessed into $class.
# Dies:      if parsargv::parse reports that the options are not acceptable.
sub new
{
    my $class = shift(@_);

    # Explicit method-call syntax; the indirect-object form
    # "new SplitPlug(...)" is ambiguous to the parser and discouraged.
    my $self = SplitPlug->new($class, @_);
    if (!parsargv::parse(\@_,
                         "allow_extra_options")) {
        die "\nIncorrect options passed to ProCitePlug, check your collect.cfg configuration file\n";
    }

    # To allow for proper inheritance of arguments
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);
    $self->{'plugin_type'} = "ProCitePlug";

    return bless $self, $class;
}
91
92
# Maps a single-letter workform indicator, as it appears at the start of a
# record, to the corresponding workform name.
my %crazy_workform_mapping = (
    "A" => "Book, Long Form",
    "B" => "Book, Short Form",
    "C" => "Journal, Long Form",
    "D" => "Journal, Short Form",
    "E" => "Report",
    "F" => "Newspaper",
    "G" => "Dissertation",
    "H" => "Trade Catalog",
    "I" => "Letter (Correspondence)",
    "J" => "Manuscript",
    "K" => "Conference Proceedings",
    "L" => "Map",
    "M" => "Music Score",
    "N" => "Sound Recording",
    "O" => "Motion Picture",
    "P" => "Audiovisual Material",
    "Q" => "Video Recording",
    "R" => "Art Work",
    "S" => "Computer Program",
    "T" => "Data File",
);
114
115
# Reads an exported ProCite file into $$textref, then removes the
# "<Workform Definition>" lines from the start of the text and remembers
# what they define.
#
# Arguments:
#   $filename - path of the file to read; also the key under which the
#               parsed workform definitions are stored
#   $encoding - encoding given to the multiread reader
#   $language - accepted but not used by this method
#   $textref  - reference to the scalar that receives the file contents
#
# Side effect: $self->{'workform_definitions'}->{$filename} is set to a hash
# that maps each workform name to an array ref of its field names (empty if
# the file could not be opened or defines no workforms).
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Store the workform definitions for this file
    my %workform_definitions = ();

    # Read the contents of the file into $textref. The three-argument form
    # of open keeps characters in the file name from being read as an open
    # mode. The package file handle is kept because the reader is handed
    # the handle by name.
    if (open(PROCITE_FILE, '<', $filename)) {
        my $reader = multiread->new();
        $reader->set_handle ('ProCitePlug::PROCITE_FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
        close(PROCITE_FILE);
    }
    else {
        # Report the failure instead of silently reading from a closed handle
        print STDERR "ProCitePlug: could not open $filename for reading: $!\n";
    }

    # Read the workform definitions at the start of the file. Each pass
    # removes one definition line so it is not processed later as a record.
    # The newline is optional so that a definition on the last line of a
    # file without a trailing newline is still consumed (otherwise this
    # loop would never end).
    while ($$textref =~ s/^\<Workform Definition\>(.*)\n?//) {
        my $workform_definition = $1;

        # The first quoted value is the workform name; skip a definition
        # that does not start with one rather than using a stale capture.
        next unless ($workform_definition =~ s/^\"([^\"]*)\",//);
        my $workform_name = $1;

        # The remaining quoted values are the field names. The loop stops
        # at the first thing that is not a quoted value (including the end
        # of the line), so a malformed tail cannot cause an endless loop.
        my @workform_values;
        while ($workform_definition =~ s/^\"([^\"]*)\",?//) {
            push(@workform_values, $1);
        }

        # Remember this workform definition for when we're reading the records
        $workform_definitions{$workform_name} = \@workform_values;
    }

    $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
}
155
156
# Turns one ProCite record (one line of the exported file) into document
# text and metadata on $doc_obj.
#
# Arguments:
#   $textref   - reference to the record text; it is consumed while parsing
#   $pluginfo  - accepted but not used by this method
#   $base_dir, $file - joined with util::filename_cat to form the file name
#                under which read_file stored the workform definitions
#   $metadata  - accepted but not used by this method
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - when true, a "<Processing ...>" line is written to STDERR
#
# Returns 1 if the record was processed, 0 if its header could not be parsed
# or its workform is not known for this file.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $filename = util::filename_cat($base_dir, $file);

    # All text and metadata goes on the top section of the document
    # (previously an undeclared, never-assigned variable was passed).
    my $cursection = $doc_obj->get_top_section();

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ProCitePlug'>\n" if ($gli);
    print $outhandle "ProCitePlug: processing $file\n"
        if ($self->{'verbosity'} > 1);

    # Build up an HTML view of the record for easy display at run-time
    my $html_record = "";

    # Read the record's workform indicator and record number. The captures
    # are only used when the substitution really matched.
    my ($workform_indicator, $recordnum);
    if ($$textref =~ s/^\"([^\"]*)\",\"([^\"]*)\",//) {
        ($workform_indicator, $recordnum) = ($1, $2);
    }
    else {
        print STDERR "ProCitePlug: malformed record in $file (no workform indicator and record number)\n";
        return 0;
    }

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record. A file for which no
    # definitions were stored is treated as having none.
    my %workform_definitions = %{$self->{'workform_definitions'}->{$filename} || {}};
    if (!$workform_definitions{$workform_indicator}) {
        print STDERR "Unknown workform!\n";
        return 0;
    }

    # Store the record text (what remains after the workform indicator and
    # record number have been removed) as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "pc.Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "pc.RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    my @workform_values = @{$workform_definitions{$workform_indicator}};

    # Read each field (surrounded by quotes) of the record. The loop ends at
    # the first thing that is not a quoted field (including the end of the
    # record), so trailing junk such as a carriage return cannot cause an
    # endless loop.
    my $fieldnum = 0;
    while ($$textref =~ s/^\"([^\"]*)\",?//) {
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # A record may have more fields than its workform defines, in
            # which case there is no name for this field.
            my $field_name = $workform_values[$fieldnum];
            my $field_label = defined($field_name) ? $field_name : "";

            # Add the display name of the metadata field for format statement convenience
            if (defined($field_name) && $field_name ne "---") {
                my $meta_name = "pc.Field" . ($fieldnum + 1) . "Name";
                $doc_obj->add_utf8_metadata($cursection, $meta_name, $field_name);
            }

            $html_record .= "<tr><td valign=top><b>$field_label: </b></td><td valign=top>";

            # Multiple metadata values are separated with "//"
            foreach my $field_value (split(/\/\//, $field_value_raw)) {
                my $meta_name = "pc.Field" . ($fieldnum + 1) . "Value";
                $doc_obj->add_utf8_metadata($cursection, $meta_name, $field_value);

                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
241
242
2431;
Note: See TracBrowser for help on using the repository browser.