source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/ProCitePlugin.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl committed in the main trunk but not in the x64 branch)

File size: 7.6 KB
Line 
1###########################################################################
2#
3# ProCitePlugin.pm -- A plugin for (exported) ProCite databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ProCitePlugin;
28
29
30use multiread;
31use SplitTextFile;
32use MetadataRead;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Set up inheritance at compile time: ProCitePlugin derives from
# MetadataRead first and SplitTextFile second (method resolution order).
BEGIN {
    @ProCitePlugin::ISA = qw(MetadataRead SplitTextFile);
}
41
42
# Argument descriptors contributed by this plugin.  Each entry is a hash
# with a name, a description string, a type, whether it is required, and
# a default value.  The "{...}" description strings are reproduced
# verbatim (presumably resource-bundle keys resolved elsewhere -- confirm).
my $arguments = [
    # File-name pattern deciding which files this plugin handles.
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    # Pattern used to split the file text into individual records.
    {
        'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_split_exp(),
    },

    # The interesting options
    # Separator between multiple values held in a single field.
    {
        'name' => "entry_separator",
        'desc' => "{ProCitePlugin.entry_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "//",
    },
];

# Plugin-level option record; 'args' points at the argument list above.
my $options = {
    'name'     => "ProCitePlugin",
    'desc'     => "{ProCitePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
69
70
# Default file-name pattern for this plugin: exported ProCite files are
# recognised by a ".txt" suffix, matched case-insensitively.
# Returns the pattern as a plain string (not a compiled regex).
sub get_default_process_exp
{
    my $txt_suffix_pattern = '(?i)(\.txt)$';
    return $txt_suffix_pattern;
}
76
77
# Default record-splitting pattern: one record per line.
# Returns the two-character pattern text backslash-n (not a literal
# newline character); it only becomes a newline match when used as a regex.
sub get_default_split_exp
{
    my $newline_pattern = '\n';
    return $newline_pattern;
}
83
84
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new
{
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this class and what it contributes before delegating upwards.
    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
98
99
# Maps the single-letter workform indicators (A-T) that can appear at the
# start of a record onto descriptive workform names.  Indicators that are
# not listed here are used as-is by the record processing code.
my %crazy_workform_mapping = (
    "A" => "Book, Long Form",
    "B" => "Book, Short Form",
    "C" => "Journal, Long Form",
    "D" => "Journal, Short Form",
    "E" => "Report",
    "F" => "Newspaper",
    "G" => "Dissertation",
    "H" => "Trade Catalog",
    "I" => "Letter (Correspondence)",
    "J" => "Manuscript",
    "K" => "Conference Proceedings",
    "L" => "Map",
    "M" => "Music Score",
    "N" => "Sound Recording",
    "O" => "Motion Picture",
    "P" => "Audiovisual Material",
    "Q" => "Video Recording",
    "R" => "Art Work",
    "S" => "Computer Program",
    "T" => "Data File",
);
121
122
# Read an exported ProCite file and peel off its workform definitions.
#
#   $filename - path of the file to read
#   $encoding - encoding name handed to the multiread reader
#   $language - accepted for interface compatibility; not used here
#   $textref  - scalar ref that receives the file contents; on return the
#               leading "<Workform Definition>" lines have been removed so
#               that only records remain
#
# Every leading line of the form
#     <Workform Definition>"Name","Field 1","Field 2",...
# is parsed and stored as
#     $self->{'workform_definitions'}->{$filename}->{Name} = [ field names ]
# for use when the individual records are processed.
#
# If the file cannot be opened a message is printed to STDERR and an empty
# definition table is stored for $filename.  Malformed definition lines are
# reported and skipped (or truncated at the malformed part) instead of
# looping forever.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Store the workform definitions for this file
    my %workform_definitions = ();

    # Three-argument open: the file name is never interpreted as a mode,
    # and leading/trailing whitespace in the name is preserved.
    if (!open(PROCITE_FILE, '<', $filename)) {
        print STDERR "ProCitePlugin::read_file could not open $filename for reading ($!)\n";
        $$textref = "" unless defined $$textref;
        $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
        return;
    }

    # Read the contents of the file into $textref.  The reader is given
    # the handle by its fully qualified name.
    my $reader = multiread->new();
    $reader->set_handle ('ProCitePlugin::PROCITE_FILE');
    $reader->set_encoding ($encoding);
    $reader->read_file ($textref);
    close(PROCITE_FILE);

    # Read the workform definitions at the start of the file.  Each
    # definition line is removed as it is recognised so it is not later
    # processed as a record.  Testing and removing in one substitution
    # (and accepting end-of-text as a line end) guarantees progress even
    # when the final definition line has no trailing newline.
    while ($$textref =~ s/^<Workform Definition>(.*)(?:\n|\z)//) {
        my $workform_definition = $1;

        # The first quoted value is the workform name
        unless ($workform_definition =~ s/^"([^"]*)",//) {
            print STDERR "ProCitePlugin::read_file ignoring malformed workform definition in $filename\n";
            next;
        }
        my $workform_name = $1;

        # The remaining quoted values are the field names, in order
        my @workform_values;
        while ($workform_definition =~ s/^"([^"]*)",?//) {
            push(@workform_values, $1);
        }

        # Anything left other than whitespace could not be parsed
        if ($workform_definition !~ /^\s*$/) {
            print STDERR "ProCitePlugin::read_file workform definition \"$workform_name\" in $filename has unparseable trailing text\n";
        }

        # Remember this workform definition for when we're reading the records
        $workform_definitions{$workform_name} = \@workform_values;
    }

    $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
}
160
161
# Turn one exported ProCite record into document text and metadata.
#
#   $textref   - scalar ref holding one record: a comma-separated list of
#                double-quoted fields, starting with the workform
#                indicator and optionally a numeric record number.  The
#                referenced text is consumed as it is parsed.
#   $pluginfo  - unused here
#   $base_dir, $file - joined with util::filename_cat to form the file
#                name under which read_file stored the workform definitions
#   $metadata  - unused here
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - unused here
#
# Metadata added to the top section: Workform, RecordNumber, FileFormat
# ("ProCite"), one value per non-empty field entry (named after the
# workform definition, or "FieldN" where the definition says "---" or has
# no name for that position), and HTMLDisplay, an HTML table view of the
# record.
#
# Returns 1 when the record was processed, 0 when the record does not start
# with a workform indicator or its workform is unknown for this file.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    my $cursection = $doc_obj->get_top_section();

    # Build up an HTML view of the record for easy display at run-time
    my $html_record = "<table>";

    # Read the record's workform indicator.  Only trust $1 when the
    # substitution actually matched; otherwise it holds a stale capture.
    unless ($$textref =~ s/^"([^"]*)",//) {
        print STDERR "ProCitePlugin: record in $filename does not start with a workform indicator\n";
        return 0;
    }
    my $workform_indicator = $1;

    # Some procite files have a record number next.  The substitution
    # itself only returns a success flag, so the number has to be taken
    # from the capture group.
    my $recordnum = "undefined";
    if (($$textref =~ s/^"(\d*)",//) && $1 ne "") {
        $recordnum = $1;
    }

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record.  Fall back to an
    # empty table when no definitions were stored for this file.
    my $workform_definitions = $self->{'workform_definitions'}->{$filename} || {};
    my $workform_fields = $workform_definitions->{$workform_indicator};
    if (!$workform_fields) {
        print STDERR "Unknown workform $workform_indicator!\n";
        return 0;
    }

    # Store the record (minus the workform indicator and record number
    # already consumed above) as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    my @workform_values = @{$workform_fields};

    # Read each field (surrounded by quotes) of the record
    my $fieldnum = 0;
    while ($$textref !~ /^\s*$/) {
        # Stop instead of spinning forever when the remaining text is not
        # a quoted field.
        unless ($$textref =~ s/^"([^"]*)",?//) {
            print STDERR "ProCitePlugin: ignoring unparseable text at field " . ($fieldnum + 1) . " of record $recordnum in $filename\n";
            last;
        }
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # Use the display name from the workform definition; unnamed
            # ("---") or undefined positions are called "FieldN" instead.
            my $field_name = $workform_values[$fieldnum];
            if (!defined($field_name) || $field_name eq "---") {
                $field_name = "Field" . ($fieldnum + 1);
            }
            $html_record .= "<tr><td valign=top><b>$field_name: </b></td><td valign=top>";

            # Multiple values in one field are separated by the
            # entry_separator option ("//" by default); split treats the
            # option value as a pattern.
            foreach my $field_value (split($self->{'entry_separator'}, $field_value_raw)) {
                $doc_obj->add_utf8_metadata($cursection, $field_name, $field_value);
                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    $html_record .= "</table>";
    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
251
252
2531;
Note: See TracBrowser for help on using the repository browser.