Context Navigation

source: gsdl/trunk/perllib/plugins/ProCitePlugin.pm@ 20747

Last change on this file since 20747 was 17480, checked in by kjdon, 16 years ago
removed the pc namespace. the metadata is now extracted metadata, and if you explode, it will get the namespace you give to the exploding script
Property svn:keywords set to `Author Date Id Revision`
File size: 7.6 KB

Line
1	###########################################################################
2	#
3	# ProCitePlugin.pm -- A plugin for (exported) ProCite databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ProCitePlugin;
28
29
30	use multiread;
31	use SplitTextFile;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance: ProCitePlugin derives from SplitTextFile. The parent list is
# assigned inside a BEGIN block so it is already in place at compile time.
BEGIN {
    @ProCitePlugin::ISA = qw(SplitTextFile);
}
40
41
# Arguments contributed by this plugin. Each entry carries the argument name,
# a description string (written as a "{Class.key}" placeholder), its type,
# whether it is required, and its default value.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        'deft' => get_default_split_exp(),
        'reqd' => "no",
    },

    # The interesting options
    {
        'name' => "entry_separator",
        'desc' => "{ProCitePlugin.entry_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "//",
    },
];

# Description record for the plugin itself; 'args' points at the argument
# list defined just above.
my $options = {
    'name'     => "ProCitePlugin",
    'desc'     => "{ProCitePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
68
69
# Default value for the "process_exp" argument: a pattern accepting file
# names that end in ".txt" (case-insensitively), the suffix of exported
# ProCite files handled by this plugin.
sub get_default_process_exp {
    my $txt_suffix_pattern = q^(?i)(\.txt)$^;
    return $txt_suffix_pattern;
}
75
76
# Default value for the "split_exp" argument: the two-character pattern
# text "\n", so the input is split at every line break.
sub get_default_split_exp {
    my $newline_pattern = q^\n^;
    return $newline_pattern;
}
82
83
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object created by the SplitTextFile constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Use an explicit class-method call. The indirect-object form
    # ("new SplitTextFile(...)") is parsed heuristically and is ambiguous
    # inside a package that defines its own new().
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
97
98
# Lookup table from single-letter workform indicators ("A" .. "T") to the
# corresponding workform names.
my %crazy_workform_mapping = (
    "A" => "Book, Long Form",
    "B" => "Book, Short Form",
    "C" => "Journal, Long Form",
    "D" => "Journal, Short Form",
    "E" => "Report",
    "F" => "Newspaper",
    "G" => "Dissertation",
    "H" => "Trade Catalog",
    "I" => "Letter (Correspondence)",
    "J" => "Manuscript",
    "K" => "Conference Proceedings",
    "L" => "Map",
    "M" => "Music Score",
    "N" => "Sound Recording",
    "O" => "Motion Picture",
    "P" => "Audiovisual Material",
    "Q" => "Video Recording",
    "R" => "Art Work",
    "S" => "Computer Program",
    "T" => "Data File",
);
120
121
# Read $filename into $$textref and strip the leading workform definitions.
#
# Arguments:
#   $filename - path of the exported ProCite file to read
#   $encoding - encoding name handed to the multiread reader
#   $language - not used by this method
#   $textref  - scalar ref that receives the file contents
#
# Every "<Workform Definition>" line at the start of the text is removed from
# $$textref and parsed into a workform name plus its list of field names.
# The result is stored in $self->{'workform_definitions'}->{$filename} as a
# hash ref mapping workform name to an array ref of field names, for later
# use when the individual records are processed.
#
# If the file cannot be opened, a message is printed to STDERR, an empty set
# of definitions is stored, and $$textref is left untouched.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Store the workform definitions for this file
    my %workform_definitions = ();

    # Three-argument open keeps characters in $filename from being read as
    # part of the open mode. The handle stays a named package handle because
    # it is handed to the reader by name below.
    if (!open(PROCITE_FILE, "<", $filename)) {
        print STDERR "ProCitePlugin: could not open $filename for reading: $!\n";
        $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
        return;
    }

    # Read the contents of the file into $textref
    my $reader = multiread->new();
    $reader->set_handle ('ProCitePlugin::PROCITE_FILE');
    $reader->set_encoding ($encoding);
    $reader->read_file ($textref);
    close(PROCITE_FILE);

    # Read the workform definitions at the start of the file. Each definition
    # line is removed so it is not processed later as a record. The trailing
    # newline is optional so that a definition on the very last line of the
    # file is still consumed and the loop always makes progress.
    while ($$textref =~ s/^\<Workform Definition\>(.*)\n?//) {
        my $workform_definition = $1;

        # The first quoted value is the name of the workform
        unless ($workform_definition =~ s/^\"([^\"]*)\",//) {
            print STDERR "ProCitePlugin: ignoring malformed workform definition in $filename\n";
            next;
        }
        my $workform_name = $1;

        # The remaining quoted values are the field names, in order
        my @workform_values;
        while ($workform_definition !~ /^\s*$/) {
            unless ($workform_definition =~ s/^\"([^\"]*)\",?//) {
                # Nothing was consumed; stop instead of looping forever
                print STDERR "ProCitePlugin: ignoring malformed text in workform definition \"$workform_name\" in $filename\n";
                last;
            }
            push(@workform_values, $1);
        }

        # Remember this workform definition for when we're reading the records
        $workform_definitions{$workform_name} = \@workform_values;
    }

    $self->{'workform_definitions'}->{$filename} = \%workform_definitions;
}
159
160
# Turn one exported ProCite record (one line of the input file) into
# document text and metadata.
#
# Arguments:
#   $textref   - scalar ref holding the record text; it is consumed
#                (modified in place) while the record is parsed
#   $pluginfo  - not used by this method
#   $base_dir  - joined with $file to form the name of the file being processed
#   $file      - see $base_dir
#   $metadata  - not used by this method
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - not used by this method
#
# Returns 1 when the record was processed, or 0 when its workform is not
# among the definitions stored for this file by read_file.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $filename = util::filename_cat($base_dir, $file);
    my $cursection = $doc_obj->get_top_section();

    # Build up an HTML view of the record for easy display at run-time
    my $html_record = "<table>";

    # Read the record's workform indicator. An empty indicator is used when
    # the record does not start with a quoted value, so that a capture left
    # over from an earlier match is never used by mistake.
    my $workform_indicator = ($$textref =~ s/^\"([^\"]*)\",//) ? $1 : "";

    # Some procite files have a record number next. Keep the captured digits;
    # the substitution's own return value is only a success flag.
    my $recordnum = "undefined";
    if ($$textref =~ s/^\"(\d*)\",// && $1 ne "") {
        $recordnum = $1;
    }

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record
    my $workform_definitions = $self->{'workform_definitions'}->{$filename} || {};
    my $workform_fields = $workform_definitions->{$workform_indicator};
    if (!$workform_fields) {
        print STDERR "Unknown workform $workform_indicator!\n";
        return 0;
    }

    # Store the full record as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "RecordNumber", $recordnum);

    # Store FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "ProCite");

    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    # Read each field (surrounded by quotes) of the record
    my $fieldnum = 0;
    while ($$textref !~ /^\s*$/) {
        unless ($$textref =~ s/^\"([^\"]*)\",?//) {
            # Nothing was consumed; stop instead of looping forever
            print STDERR "ProCitePlugin: ignoring malformed text in record $recordnum of $filename\n";
            last;
        }
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # Use the display name from the workform definition. Fall back to
            # a positional name when the definition has no usable name for
            # this position ("---" or more fields than the definition lists).
            my $field_name = $workform_fields->[$fieldnum];
            if (!defined $field_name || $field_name eq "---") {
                $field_name = "Field" . ($fieldnum + 1);
            }
            $html_record .= "<tr><td valign=top><b>$field_name: </b></td><td valign=top>";

            # Multiple metadata values are separated with the entry separator
            # ("//" by default).
            # NOTE(review): split() interprets the separator as a pattern, so
            # regex metacharacters in a user-supplied separator are active.
            foreach my $field_value (split($self->{'entry_separator'}, $field_value_raw)) {
                $doc_obj->add_utf8_metadata($cursection, $field_name, $field_value);
                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    $html_record .= "</table>";
    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
250
251
252	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: