source: trunk/gsdl/perllib/plugins/ProCitePlug.pm@ 7243

Last change on this file since 7243 was 7195, checked in by mdewsnip, 20 years ago

First cut at a plugin for processing (exported) ProCite databases. It is likely that more functionality will be added at user request.

To use it, open the database in ProCite (Windows only, I believe), mark all records, choose File -> Export Marked Records, pick Comma Delimited, then export to a text file. Put this text file in your collection's "import" folder, and add ProCitePlug into the collection configuration file (make sure the TEXTPlug is removed). Import then build collection.

Making this into a nice collection involves a bit more work. You probably want to use MGPP, and set groupsize to something like 100. You'll also need to work with format statements and choose your classifiers intelligently. Lastly, you'll probably want to build indexes on the metadata fields (and this means setting collectionmeta values to describe them too).

An example ProCite collection is /greenstone/gsdl/collect/procite on puka.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1###########################################################################
2#
3# ProCitePlug.pm -- A plugin for (exported) ProCite databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ProCitePlug;
28
29
30use multiread;
31use SplitPlug;
32
33
# Inheritance: ProCitePlug derives from SplitPlug. The parent list is set
# up at compile time so that it is in place before any method is resolved.
BEGIN {
    @ISA = qw(SplitPlug);
}
38
39
# Command-line/configuration arguments understood by this plugin. The
# 'desc' values are keys into the description resources, not literal text.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "split_exp",
        'desc' => "{SplitPlug.split_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_split_exp(),
    },
];

# Description of the plugin itself; pushed onto the inherited option list
# by the constructor.
my $options = {
    'name'     => "ProCitePlug",
    'desc'     => "{ProCitePlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
58
59
# Default expression selecting the files this plugin handles: exported
# ProCite files, recognised by a ".txt" suffix (matched case-insensitively).
sub get_default_process_exp
{
    my $txt_suffix_exp = '(?i)(\.txt)$';
    return $txt_suffix_exp;
}
65
66
# Default expression used to split the input into records: the exported
# file holds one record per line, so the text is split at every newline.
# The value returned is the two-character regexp source "\n", not a
# literal newline character.
sub get_default_split_exp
{
    my $newline_exp = '\n';
    return $newline_exp;
}
72
73
# Constructor. Builds the underlying SplitPlug object, validates the
# plugin options, registers this plugin's option description on the
# object's option list, and re-blesses the object into the requested class.
# Dies if the options cannot be parsed.
sub new
{
    my $class = shift(@_);

    # Let the parent class construct the object
    my $self = SplitPlug->new($class, @_);

    # This plugin defines no options of its own; anything extra is
    # tolerated here via "allow_extra_options"
    my $args_ok = parsargv::parse(\@_, "allow_extra_options");
    unless ($args_ok) {
        die "\nIncorrect options passed to ProCitePlug, check your collect.cfg configuration file\n";
    }

    # To allow for proper inheritance of arguments
    my $inherited_options = $self->{'option_list'};
    push(@{$inherited_options}, $options);
    $self->{'plugin_type'} = "ProCitePlug";

    return bless($self, $class);
}
91
92
# Workform definitions read from the file currently being processed:
# workform name -> reference to the list of field names, in field order.
# Filled in by read_file and consulted by process.
my %workform_definitions;

# Records may identify their workform with a single letter instead of the
# workform name; this table translates those letters into the names.
my %crazy_workform_mapping = (
    "A" => "Book, Long Form",
    "B" => "Book, Short Form",
    "C" => "Journal, Long Form",
    "D" => "Journal, Short Form",
    "E" => "Report",
    "F" => "Newspaper",
    "G" => "Dissertation",
    "H" => "Trade Catalog",
    "I" => "Letter (Correspondence)",
    "J" => "Manuscript",
    "K" => "Conference Proceedings",
    "L" => "Map",
    "M" => "Music Score",
    "N" => "Sound Recording",
    "O" => "Motion Picture",
    "P" => "Audiovisual Material",
    "Q" => "Video Recording",
    "R" => "Art Work",
    "S" => "Computer Program",
    "T" => "Data File",
);
115
116
# Read an exported ProCite file into $$textref and strip the workform
# definitions from the front of it.
#
# Parameters:
#   $filename - path of the file to read
#   $encoding - encoding handed to the multiread reader
#   $language - accepted for interface compatibility; not used here
#   $textref  - reference to the scalar that receives the file contents
#
# The file starts with zero or more lines of the form
#   <Workform Definition>"Workform Name","Field 1","Field 2",...
# Each such line is removed from $$textref (so that it is not later treated
# as a record) and stored in %workform_definitions as
#   workform name -> reference to the list of field names.
# Definitions left over from a previously read file are discarded first.
#
# If the file cannot be opened a message is printed to STDERR and the
# method returns without touching $$textref.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Reset the workform definitions from previous files
    %workform_definitions = ();

    # Read the contents of the file into $textref. The three-argument form
    # of open keeps odd characters in the filename from being interpreted
    # as part of the open mode.
    if (!open(PROCITE_FILE, "<", $filename)) {
        print STDERR "ProCitePlug: could not open $filename for reading ($!)\n";
        return;
    }
    my $reader = multiread->new();
    $reader->set_handle ('ProCitePlug::PROCITE_FILE');
    $reader->set_encoding ($encoding);
    $reader->read_file ($textref);
    close(PROCITE_FILE);

    # Read the workform definitions at the start of the file. Testing and
    # removing the line in a single substitution guarantees progress on
    # every iteration, even when the final definition line has no
    # terminating newline.
    while ($$textref =~ s/^\<Workform Definition\>(.*)(?:\n|\z)//) {
        my $workform_definition = $1;

        # Drop trailing whitespace (e.g. the carriage return left behind by
        # Windows line endings) so that it is not mistaken for another field
        $workform_definition =~ s/\s+$//;

        # The first quoted value is the name of the workform
        unless ($workform_definition =~ s/^\"([^\"]*)\",?//) {
            print STDERR "ProCitePlug: ignoring malformed workform definition in $filename\n";
            next;
        }
        my $workform_name = $1;

        # The remaining quoted values are the field names, in field order
        my @workform_values;
        while ($workform_definition !~ /^$/) {
            # Stop at anything that is not a quoted field; without this
            # check the loop would never terminate on malformed input
            last unless ($workform_definition =~ s/^\"([^\"]*)\",?//);
            push(@workform_values, $1);
        }

        # Remember this workform definition for when we're reading the records
        $workform_definitions{$workform_name} = \@workform_values;
    }
}
154
155
# Turn one exported ProCite record into the content of a document object.
#
# Parameters:
#   $textref   - reference to the text of a single record, of the form
#                "workform","record number","field 1","field 2",...
#                The text is consumed in place while it is parsed.
#   $pluginfo, $base_dir, $metadata - accepted for interface compatibility;
#                not used here
#   $file      - name of the file being processed (used in messages only)
#   $doc_obj   - document object that receives the text and metadata
#   $gli       - when true, a <Processing> line is printed to STDERR
#
# The document receives:
#   - the record text (after the workform and record number) as its text
#   - pc.Workform and pc.RecordNumber metadata
#   - pc.Field<N>Name / pc.Field<N>Value metadata for each non-empty field,
#     where a field holding several values separated by "//" yields one
#     pc.Field<N>Value per value
#   - HTMLDisplay metadata holding table rows that present the record
#
# Returns 1 if the record was processed, or 0 if it was skipped because its
# header could not be parsed or its workform has no known definition.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ProCitePlug'>\n" if ($gli);
    print $outhandle "ProCitePlug: processing $file\n"
        if ($self->{'verbosity'} > 1);

    # Everything is attached to the top-level section of the document
    my $cursection = $doc_obj->get_top_section();

    # Build up an HTML view of the record for easy display at run-time
    my $html_record = "";

    # Read the record's workform indicator and record number. The captures
    # are only trusted when the substitution actually matched.
    unless ($$textref =~ s/^\"([^\"]*)\",\"([^\"]*)\",//) {
        print STDERR "ProCitePlug: malformed record in $file, skipping\n";
        return 0;
    }
    my $workform_indicator = $1;
    my $recordnum = $2;

    # If necessary, map the workform indicator into something useful
    if ($crazy_workform_mapping{$workform_indicator}) {
        $workform_indicator = $crazy_workform_mapping{$workform_indicator};
    }

    # Check we know about the workform of this record
    if (!$workform_definitions{$workform_indicator}) {
        print STDERR "Unknown workform!\n";
        return 0;
    }

    # Store the record (minus workform and record number) as the document text
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Store workform and record number as metadata
    $doc_obj->add_utf8_metadata($cursection, "pc.Workform", $workform_indicator);
    $doc_obj->add_utf8_metadata($cursection, "pc.RecordNumber", $recordnum);

    $html_record .= "<tr><td valign=top><b>Record Number: </b></td><td valign=top>$recordnum</td></tr>";

    my @workform_values = @{$workform_definitions{$workform_indicator}};

    # Read each field (surrounded by quotes) of the record
    my $fieldnum = 0;
    while ($$textref !~ /^$/) {
        # Stop at anything that is not a quoted field (for example a stray
        # carriage return at the end of the line); without this check the
        # loop would never terminate on such input
        last unless ($$textref =~ s/^\"([^\"]*)\",?//);
        my $field_value_raw = $1;

        # Add non-empty metadata values to the document
        unless ($field_value_raw eq "") {
            # Add the display name of the metadata field for format statement
            # convenience. A record may hold more fields than its workform
            # defines, in which case there is no name to store. A field whose
            # name is "---" gets no name metadata but is still displayed.
            my $field_name = $workform_values[$fieldnum];
            if (defined($field_name) && $field_name ne "---") {
                my $meta_name = "pc.Field" . ($fieldnum + 1) . "Name";
                $doc_obj->add_utf8_metadata($cursection, $meta_name, $field_name);
            }

            my $display_name = defined($field_name) ? $field_name : "";
            $html_record .= "<tr><td valign=top><b>$display_name: </b></td><td valign=top>";

            # Multiple metadata values are separated with "//"
            my $value_meta_name = "pc.Field" . ($fieldnum + 1) . "Value";
            foreach my $field_value (split(/\/\//, $field_value_raw)) {
                $doc_obj->add_utf8_metadata($cursection, $value_meta_name, $field_value);

                $html_record .= $field_value . "<br>";
            }

            $html_record .= "</td></tr>";
        }

        $fieldnum++;
    }

    # Mention any text that could not be parsed as fields
    if ($$textref =~ /\S/) {
        print $outhandle "ProCitePlug: ignoring unparsable text at end of record $recordnum in $file\n";
    }

    # Store HTML view of record as metadata
    $doc_obj->add_utf8_metadata($cursection, "HTMLDisplay", $html_record);

    # Record was processed successfully
    return 1;
}
235
236
2371;
Note: See TracBrowser for help on using the repository browser.