Context Navigation

source: trunk/gsdl/perllib/plugins/ProCitePlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago
Jeffrey's new parsing modifications, committed approx 6 July, 15.16
Property svn:keywords set to `Author Date Id Revision`
File size: 7.1 KB

Line
1	###########################################################################
2	#
3	# ProCitePlug.pm -- A plugin for (exported) ProCite databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ProCitePlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33
# ProCitePlug inherits from SplitPlug; the parent class is registered at
# compile time so that method resolution works as soon as the file is loaded.
BEGIN {
    @ISA = qw(SplitPlug);
}
38
39
# Descriptions of the arguments this plugin accepts.  The default values are
# taken from the get_default_*() subs defined further down in this file.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'split_exp',
        'desc' => '{SplitPlug.split_exp}',
        'type' => 'regexp',
        'deft' => get_default_split_exp(),
        'reqd' => 'no',
    },
];

# Description of the plugin itself; new() appends it to the "OptList" array.
my $options = {
    'name'     => 'ProCitePlug',
    'desc'     => '{ProCitePlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
58
59
# Default filename filter: exported ProCite files are recognised by a ".txt"
# suffix, matched case-insensitively.
sub get_default_process_exp {
    my $txt_suffix_exp = '(?i)(\.txt)$';
    return $txt_suffix_exp;
}
65
66
# Default record separator: the input text is split at every newline, so each
# line of the file is treated as one record.
sub get_default_split_exp {
    my $newline_exp = '\n';
    return $newline_exp;
}
72
73
# Constructor.
#
#   $pluginlist      - array ref of plugin class names; this class's name is
#                      appended to it
#   $inputargs       - handed unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays collect
#                      the argument and option descriptions of every class in
#                      the hierarchy; created here when the caller omits it
#
# Returns the object built by SplitPlug, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The original relied on autovivification inside the pushes below to
    # create this hash, which made its "two-argument" fallback unreachable.
    # Create it explicitly so the single call that follows is obviously right.
    $hashArgOptLists = {} unless defined $hashArgOptLists;

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if defined $arguments;
    push(@{$hashArgOptLists->{"OptList"}}, $options) if defined $options;

    # Direct method call: the indirect form "new SplitPlug(...)" depends on
    # parser heuristics and is ambiguous inside a package that defines new().
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
87
88
# Maps the single-letter workform indicators found in records to the
# descriptive workform names; process() applies it before looking up the
# workform definition of a record.
my %crazy_workform_mapping = (
    "A" => "Book, Long Form",
    "B" => "Book, Short Form",
    "C" => "Journal, Long Form",
    "D" => "Journal, Short Form",
    "E" => "Report",
    "F" => "Newspaper",
    "G" => "Dissertation",
    "H" => "Trade Catalog",
    "I" => "Letter (Correspondence)",
    "J" => "Manuscript",
    "K" => "Conference Proceedings",
    "L" => "Map",
    "M" => "Music Score",
    "N" => "Sound Recording",
    "O" => "Motion Picture",
    "P" => "Audiovisual Material",
    "Q" => "Video Recording",
    "R" => "Art Work",
    "S" => "Computer Program",
    "T" => "Data File",
);
110
111
# Read an exported ProCite file and strip the workform definitions from it.
#
#   $filename - path of the file to read
#   $encoding - encoding handed to the multiread reader
#   $language - not used by this method
#   $textref  - scalar ref that receives the file contents; on return the
#               leading "<Workform Definition>" lines have been removed, so
#               only record lines remain
#
# Each definition line has the form
#   <Workform Definition>"Name","Field 1","Field 2",...
# The parsed definitions (workform name => array ref of field names) are
# stored in $self->{'workform_definitions'}->{$filename} for use by process().
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Store the workform definitions for this file
    my %workform_definitions = ();

    # Read the contents of the file into $textref.  Three-argument open keeps
    # odd characters in the filename from being interpreted as an open mode.
    if (open(PROCITE_FILE, '<', $filename)) {
        my $reader = multiread->new();
        $reader->set_handle ('ProCitePlug::PROCITE_FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
        close(PROCITE_FILE);
    }
    else {
        # Previously an open failure went unnoticed; report it and carry on
        # with empty text so the caller simply finds no records.
        print STDERR "ProCitePlug: could not open $filename for reading ($!)\n";
        $$textref = "" unless defined $$textref;
    }

    # Read the workform definitions at the start of the file, removing each
    # definition line so it is not processed later as a record.  The newline
    # is optional so a final definition line without one cannot loop forever.
    while ($$textref =~ s/^\<Workform Definition\>([^\n]*)\n?//) {
        my $workform_definition = $1;

        # Tolerate CRLF line endings
        $workform_definition =~ s/\r\z//;

        # The first quoted value is the name of the workform
        my $workform_name;
        if ($workform_definition =~ s/^\"([^\"]*)\",//) {
            $workform_name = $1;
        }
        else {
            print STDERR "ProCitePlug: skipping malformed workform definition in $filename\n";
            next;
        }

        # The remaining quoted values are the field names, in order
        my @workform_values;
        while ($workform_definition ne "") {
            unless ($workform_definition =~ s/^\"([^\"]*)\",?//) {
                # Stop instead of spinning forever on text that is not a quoted field
                print STDERR "ProCitePlug: ignoring malformed text in workform definition \"$workform_name\" in $filename\n";
                last;
            }
            push(@workform_values, $1);
        }

        # Remember this workform definition for when we're reading the records
        $workform_definitions{$workform_name} = \@workform_values;
    }

    $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
}
151
152
# Process one record (one line) of an exported ProCite file.
#
#   $textref   - scalar ref holding the record text; it is consumed while the
#                record is parsed
#   $pluginfo  - not used by this method
#   $base_dir, $file - joined to form the filename under which read_file()
#                stored the workform definitions
#   $metadata  - not used by this method
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - when true, a "<Processing ...>" line is written to STDERR
#
# A record has the form
#   "workform","record number","field 1","field 2",...
# Returns 1 when the record was processed and 0 when it is malformed or its
# workform is unknown.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $filename = util::filename_cat($base_dir, $file);

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ProCitePlug'>\n" if ($gli);
    print $outhandle "ProCitePlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    # NOTE(review): the original used $cursection without ever declaring or
    # assigning it, so an undefined section was passed to every call below.
    # get_top_section() is assumed to be the doc accessor for the top-level
    # section - confirm against doc.pm for this revision.
    my $cursection = $doc_obj->get_top_section();

    # Build up an HTML view of the record for easy display at run-time
    # NOTE(review): names and values are inserted into the HTML unescaped.
    my $html_record = "";

    # Read the record's workform indicator and record number.  Both may be
    # longer than one character (record numbers from 10 upwards, workform
    # names that are spelled out), so each value needs the "*" quantifier.
    my ($workform_indicator, $recordnum);
    if ($$textref =~ s/^\"([^\"]*)\",\"([^\"]*)\",//) {
        ($workform_indicator, $recordnum) = ($1, $2);
    }
    else {
        print STDERR "ProCitePlug: malformed record in $file (no workform indicator and record number)\n";
        return 0;
    }

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record
    my $workform_definitions = $self->{'workform_definitions'}->{$filename} || {};
    my $workform_values = $workform_definitions->{$workform_indicator};
    if (!$workform_values) {
        print STDERR "Unknown workform!\n";
        return 0;
    }

    # Store the full record as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "pc.Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "pc.RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    # Read each field (surrounded by quotes) of the record
    my $fieldnum = 0;
    while ($$textref !~ /^$/) {
        unless ($$textref =~ s/^\"([^\"]*)\",?//) {
            # Whatever is left is not a quoted field.  Stop here rather than
            # loop forever; stay silent when only whitespace remains (for
            # example the "\r" of a CRLF line ending).
            print STDERR "ProCitePlug: ignoring malformed trailing data in record $recordnum of $file\n"
                if $$textref =~ /\S/;
            last;
        }
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # A record may hold more fields than its workform defines, in
            # which case there is no name for this position.
            my $field_name = $workform_values->[$fieldnum];
            my $field_label = defined $field_name ? $field_name : "";

            # Add the display name of the metadata field for format statement convenience
            if (defined $field_name && $field_name ne "---") {
                my $meta_name = "pc.Field" . ($fieldnum + 1) . "Name";
                $doc_obj->add_utf8_metadata($cursection, $meta_name, $field_name);
            }

            $html_record .= "<tr><td valign=top><b>$field_label: </b></td><td valign=top>";

            # Multiple metadata values are separated with "//"
            my $meta_name = "pc.Field" . ($fieldnum + 1) . "Value";
            foreach my $field_value (split(/\/\//, $field_value_raw)) {
                $doc_obj->add_utf8_metadata($cursection, $meta_name, $field_value);

                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
237
238
239	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: