source: trunk/gsdl/perllib/plugins/ProCitePlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# ProCitePlug.pm -- A plugin for (exported) ProCite databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ProCitePlug;
28
29
30use multiread;
31use SplitPlug;
32
33
# ProCitePlug inherits from SplitPlug. The parent list is assigned inside
# a BEGIN block, i.e. while the file is still being compiled.
BEGIN {
    @ISA = qw(SplitPlug);
}
38
39
# Descriptions of the arguments this plugin accepts. Every entry records
# the argument's name, a brace-delimited description string, its type,
# whether it is required, and its default value (obtained from the
# get_default_* subs of this package).
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "split_exp",
        'desc' => "{SplitPlug.split_exp}",
        'type' => "regexp",
        'deft' => get_default_split_exp(),
        'reqd' => "no",
    },
];

# Description of the plugin itself; 'args' points at the list above.
# Both structures are handed to the parent constructor by new().
my $options = {
    'name'     => "ProCitePlug",
    'desc'     => "{ProCitePlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
58
59
# Default "process_exp": the pattern (as a string) naming the files this
# plugin handles -- anything ending in ".txt", compared case-insensitively.
sub get_default_process_exp {
    my $txt_suffix_pattern = q{(?i)(\.txt)$};
    return $txt_suffix_pattern;
}
65
66
# Default "split_exp": the pattern (as a string) at which the input text
# is cut into records -- a newline, so every line becomes one record.
sub get_default_split_exp {
    my $newline_pattern = q{\n};
    return $newline_pattern;
}
72
73
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the name of this class is appended to it
#   $inputargs       - handed unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the object created by the SplitPlug constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # $arguments and $options are file-level lexicals that are always set,
    # so they are appended unconditionally. These pushes also autovivify
    # $hashArgOptLists into a hash ref when the caller passed none, which
    # is why no "was it defined?" test is needed before calling the parent.
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call instead of the indirect-object form
    # ("new SplitPlug(...)"), whose parsing depends on compile-time guesses.
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
87
88
# Translation table from single-letter workform indicators to workform
# names. process() replaces a record's indicator with the name found here
# before looking up the matching workform definition.
my %crazy_workform_mapping = (
    "A" => "Book, Long Form",
    "B" => "Book, Short Form",
    "C" => "Journal, Long Form",
    "D" => "Journal, Short Form",
    "E" => "Report",
    "F" => "Newspaper",
    "G" => "Dissertation",
    "H" => "Trade Catalog",
    "I" => "Letter (Correspondence)",
    "J" => "Manuscript",
    "K" => "Conference Proceedings",
    "L" => "Map",
    "M" => "Music Score",
    "N" => "Sound Recording",
    "O" => "Motion Picture",
    "P" => "Audiovisual Material",
    "Q" => "Video Recording",
    "R" => "Art Work",
    "S" => "Computer Program",
    "T" => "Data File",
);
110
111
# Read an exported ProCite file into $$textref and strip off its header.
#
# Arguments:
#   $filename - path of the file to read
#   $encoding - encoding name handed to the multiread reader
#   $language - accepted as part of the interface; not used here
#   $textref  - scalar ref that receives the file contents
#
# Leading "<Workform Definition>" lines are removed from $$textref and
# parsed into a hash (workform name -> array ref of field names). That
# hash is stored in $self->{'workform_definitions'}->{$filename}, where
# process() looks it up. A reference to the hash is also returned.
#
# If the file cannot be opened a message is printed to STDERR, an empty
# definition table is stored and $$textref is left untouched.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Store the workform definitions for this file
    my %workform_definitions = ();
    $self->{'workform_definitions'}->{$filename} = \%workform_definitions;

    # Read the contents of the file into $textref. The three-argument open
    # keeps odd characters in $filename from being read as a mode, and the
    # result is checked so a missing file is reported rather than ignored.
    # The named handle is kept because multiread is given the handle by name.
    if (!open(PROCITE_FILE, "<", $filename)) {
        print STDERR "ProCitePlug: could not open $filename for reading: $!\n";
        return \%workform_definitions;
    }
    my $reader = multiread->new();
    $reader->set_handle ('ProCitePlug::PROCITE_FILE');
    $reader->set_encoding ($encoding);
    $reader->read_file ($textref);
    close(PROCITE_FILE);

    # Read the workform definitions at the start of the file, removing each
    # definition line so it is not processed later as a record. The
    # substitution itself is the loop condition, so the loop always makes
    # progress -- even when the last definition line has no trailing newline.
    while ($$textref =~ s/^\<Workform Definition\>(.*)(?:\n|\z)//) {
	my $workform_definition = $1;

	# Drop the carriage return left behind by CRLF line endings
	$workform_definition =~ s/\r$//;

	# The first quoted value is the name of the workform
	unless ($workform_definition =~ s/^\"([^\"]*)\",?//) {
	    print STDERR "ProCitePlug: skipping malformed workform definition in $filename\n";
	    next;
	}
	my $workform_name = $1;

	# The remaining quoted values are the field names, in order. Each
	# successful substitution consumes text, so this loop terminates.
	my @workform_values;
	while ($workform_definition =~ s/^\"([^\"]*)\",?//) {
	    push(@workform_values, $1);
	}
	if ($workform_definition =~ /\S/) {
	    print STDERR "ProCitePlug: ignoring unparsable text in workform definition \"$workform_name\" in $filename\n";
	}

	# Remember this workform definition for when we're reading the records
	$workform_definitions{$workform_name} = \@workform_values;
    }

    return \%workform_definitions;
}
151
152
# Turn one exported ProCite record (one segment of the split file) into
# text and metadata on $doc_obj.
#
# Arguments:
#   $textref   - scalar ref holding the record; it is consumed while parsing
#   $pluginfo  - part of the interface; not used here
#   $base_dir, $file - joined to form the file name under which read_file()
#                stored the workform definitions
#   $metadata  - part of the interface; not used here
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - when true, a "<Processing ...>" line is printed to STDERR
#
# A record is a comma-separated list of double-quoted values: the workform
# indicator, the record number, then the field values in the order given by
# the workform definition.
#
# Returns 1 when the record was processed, 0 when its header cannot be
# parsed or its workform is unknown.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $filename = util::filename_cat($base_dir, $file);

    # All text and metadata are attached to the document's top section.
    # (Previously $cursection was never declared or assigned, so an
    # undefined value was passed as the section.)
    my $cursection = $doc_obj->get_top_section();

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ProCitePlug'>\n" if ($gli);
    print $outhandle "ProCitePlug: processing $file\n"
	if ($self->{'verbosity'} > 1);

    # Build up an HTML view of the record for easy display at run-time
    my $html_record = "";

    # Read the record's workform indicator and record number. The result of
    # the substitution is checked so stale capture values are never used.
    unless ($$textref =~ s/^\"([^\"]*)\",\"([^\"]*)\",//) {
	print STDERR "ProCitePlug: malformed record in $file (no workform indicator and record number)\n";
	return 0;
    }
    my ($workform_indicator, $recordnum) = ($1, $2);

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
	$workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record
    my $workform_definitions = $self->{'workform_definitions'}->{$filename} || {};
    my $workform_fields = $workform_definitions->{$workform_indicator};
    if (!$workform_fields) {
	print STDERR "Unknown workform!\n";
	return 0;
    }

    # Store the record (with indicator and record number already removed)
    # as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "pc.Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "pc.RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    # Read each field (surrounded by quotes) of the record. The substitution
    # is the loop condition: every pass consumes text, so a malformed record
    # or a trailing carriage return can no longer cause an endless loop.
    my $fieldnum = 0;
    while ($$textref =~ s/^\"([^\"]*)\",?//) {
	my $field_value_raw = $1;

	# Add non-empty metadata values to the document
	if ($field_value_raw ne "") {
	    # A record may carry more fields than its workform defines, in
	    # which case there is no name for this position.
	    my $field_name = $workform_fields->[$fieldnum];
	    my $display_name = defined($field_name) ? $field_name : "";

	    # Add the display name of the metadata field for format statement
	    # convenience; names given as "---" are not stored.
	    if (defined($field_name) && $field_name ne "---") {
		my $name_meta = "pc.Field" . ($fieldnum + 1) . "Name";
		$doc_obj->add_utf8_metadata($cursection, $name_meta, $field_name);
	    }

	    $html_record .= "<tr><td valign=top><b>$display_name: </b></td><td valign=top>";

	    # Multiple metadata values are separated with "//"
	    my $value_meta = "pc.Field" . ($fieldnum + 1) . "Value";
	    foreach my $field_value (split(/\/\//, $field_value_raw)) {
		$doc_obj->add_utf8_metadata($cursection, $value_meta, $field_value);

		$html_record .= $field_value . "<br>";
	    }

	    $html_record .= "</td></tr>";
	}

	$fieldnum++;
    }

    # Anything other than whitespace left over could not be parsed as a field
    if ($$textref =~ /\S/) {
	print STDERR "ProCitePlug: ignoring unparsable trailing text in record $recordnum of $file\n";
    }

    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
237
238
2391;
Note: See TracBrowser for help on using the repository browser.