source: main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm@ 23167

Last change on this file since 23167 was 23167, checked in by davidb, 14 years ago

GreenstoneXMLPlugin used to (or at least in theory used to) to be able to process files without them needing to be fed down the pipeline from ArchiveInfPlugin; however, this has not been tested for a long time. A very long time. It's certainly the case its not working now. For some new work, we need GreenstoneXMLPlugin to work correctly when processing doc.xml (and associated files all in the same folder) directly. For this new use, this occurs when it is placed in the *import* folder (part of the DL talkback scheme). The mods being commited here to the plugin give this capability back to the plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.2 KB
Line 
1
2###########################################################################
3#
4# GreenstoneXMLPlugin.pm
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Processes GreenstoneArchive XML documents. Note that this plugin does no
28# syntax checking (though the XML::Parser module tests for
29# well-formedness). It's assumed that the GreenstoneArchive files conform
30# to their DTD.
31
32package GreenstoneXMLPlugin;
33
34use Encode;
35use File::Basename;
36
37use ReadXMLFile;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42sub BEGIN {
43 @GreenstoneXMLPlugin::ISA = ('ReadXMLFile');
44}
45
46
47
48
49sub get_default_process_exp {
50 my $self = shift (@_);
51
52 return q^(?i)doc\.xml$^;
53}
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasePlugin.process_exp}",
58 'type' => "regexp",
59 'deft' => &get_default_process_exp(),
60 'reqd' => "no" } ];
61
62my $options = { 'name' => "GreenstoneXMLPlugin",
63 'desc' => "{GreenstoneXMLPlugin.desc}",
64 'abstract' => "no",
65 'inherits' => "yes",
66 'args' => $arguments };
67
68sub new {
69 my ($class) = shift (@_);
70 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
71 push(@$pluginlist, $class);
72
73 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
74 push(@{$hashArgOptLists->{"OptList"}},$options);
75
76 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
77
78 $self->{'section'} = "";
79 $self->{'section_level'} = 0;
80 $self->{'metadata_name'} = "";
81 $self->{'metadata_value'} = "";
82 $self->{'content'} = "";
83 $self->{'metadata_read_store'} = {};
84
85# # Currently used to store information for previous values controls. In
86# # the next contract I'll move to using information directly from Lucene.
87# $self->{'sqlfh'} = 0;
88
89 return bless $self, $class;
90}
91
92
93
94
95sub metadata_read {
96 my $self = shift (@_);
97 my ($pluginfo, $base_dir, $file, $block_hash,
98 $extrametakeys, $extrametadata, $extrametafile,
99 $processor, $maxdocs, $gli) = @_;
100
101 my $outhandle = $self->{'outhandle'};
102
103 # can we process this file??
104 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
105 return undef unless $self->can_process_this_file($filename_full_path);
106
107 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
108
109 print $outhandle "GreenstoneXMLlugin: setting up block list for $file\n"
110 if $self->{'verbosity'} > 1;
111
112 my $line;
113 if (open(GIN,"<:utf8",$filename_full_path)) {
114
115 while (defined($line=<GIN>)) {
116 if ($line =~ m@<Metadata\s+name="gsdlassocfile">([^:]*):(?:[^:]*):(?:[^:]*)</Metadata>@) {
117 my $gsdl_assoc_file = $1;
118
119 my $dirname = dirname($filename_full_path);
120 my $full_gsdl_assoc_filename = &util::filename_cat($dirname,$gsdl_assoc_file);
121 if ($self->{'verbosity'}>2) {
122 print $outhandle " Storing block list item: $full_gsdl_assoc_filename\n";
123 }
124
125 $block_hash->{'file_blocks'}->{$full_gsdl_assoc_filename} = 1;
126 }
127 }
128
129 close(GIN);
130 }
131 else {
132
133 print $outhandle "Error: Failed to open $file in GreenstoneXMLPlugin::metadata_read()\n";
134 print $outhandle " $!\n";
135 }
136
137
138 $self->{'metadata_read_store'}->{$filename_full_path} = 1;
139
140 return 1;
141}
142
143
144sub xml_start_document {
145}
146
147sub xml_end_document {
148}
149
150sub get_doctype {
151 my $self = shift(@_);
152
153 return "(Greenstone)?Archive";
154}
155
156
157sub xml_doctype {
158 my $self = shift(@_);
159
160 my ($expat, $name, $sysid, $pubid, $internal) = @_;
161
162 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
163 # as well as the "Archive" files which should now be created by import.pl
164 die "" if ($name !~ /^(Greenstone)?Archive$/);
165
166 my $outhandle = $self->{'outhandle'};
167 print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
168 print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
169
170}
171
172
173sub xml_start_tag {
174 my $self = shift(@_);
175 my ($expat, $element) = @_;
176
177 $self->{'element'} = $element;
178 if ($element eq "Section") {
179 if ($self->{'section_level'} == 0) {
180 $self->open_document();
181 } else {
182 my $doc_obj = $self->{'doc_obj'};
183 $self->{'section'} =
184 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
185 }
186
187 $self->{'section_level'} ++;
188 }
189 elsif ($element eq "Metadata") {
190 $self->{'metadata_name'} = $_{'name'};
191 }
192}
193
194sub xml_end_tag {
195 my $self = shift(@_);
196 my ($expat, $element) = @_;
197
198 if ($element eq "Section") {
199 $self->{'section_level'} --;
200 $self->{'section'} = $self->{'doc_obj'}->get_parent_section ($self->{'section'});
201 $self->close_document() if $self->{'section_level'} == 0;
202 }
203 elsif ($element eq "Metadata") {
204 # text read in by XML::Parser is in Perl's binary byte value
205 # form ... need to explicitly make it UTF-8
206
207 my $metadata_name = decode("utf-8",$self->{'metadata_name'});
208 my $metadata_value = decode("utf-8",$self->{'metadata_value'});
209
210 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
211 $metadata_name,$metadata_value);
212
213 # Ensure this value is added to the allvalues database in gseditor.
214 # Note that the database constraints prevent multiple occurances of the
215 # same key-value pair.
216 # We write these out to a file, so they can all be commited in one
217 # transaction
218 #if (!$self->{'sqlfh'})
219 # {
220 # my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql";
221 # # If the file doesn't already exist, open it and begin a transaction
222 # my $sql_fh;
223 # if (!-e $sql_file)
224 # {
225 # open($sql_fh, ">" . $sql_file);
226 # print $sql_fh "BEGIN TRANSACTION;\n";
227 # }
228 # else
229 # {
230 # open($sql_fh, ">>" . $sql_file);
231 # }
232 # print STDERR "Opened SQL log\n";
233 # $self->{'sqlfh'} = $sql_fh;
234 # }
235
236 #my $mvalue = $self->{'metadata_value'};
237 #$mvalue =~ s/\'/\'\'/g;
238 #$mvalue =~ s/_claimantsep_/ \& /g;
239
240 #my $fh = $self->{'sqlfh'};
241 #if ($fh)
242 # {
243 # print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n";
244 # }
245
246 # Clean Up
247 $self->{'metadata_name'} = "";
248 $self->{'metadata_value'} = "";
249 }
250 elsif ($element eq "Content" && $self->{'content'} ne "") {
251
252 # text read in by XML::Parser is in Perl's binary byte value
253 # form ... need to explicitly make it UTF-8
254 my $content = decode("utf-8",$self->{'content'});
255
256 $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content);
257 $self->{'content'} = "";
258 }
259 $self->{'element'} = "";
260}
261
262sub xml_text {
263 my $self = shift(@_);
264 my ($expat) = @_;
265
266 if ($self->{'element'} eq "Metadata") {
267 $self->{'metadata_value'} .= $_;
268 }
269 elsif ($self->{'element'} eq "Content") {
270 $self->{'content'} .= $_;
271 }
272}
273
274sub open_document {
275 my $self = shift(@_);
276
277 my $filename = $self->{'filename'};
278
279 # create a new document
280 if (defined $self->{'metadata_read_store'}->{$filename}) {
281 # Being processed as part of *import* phase
282 # (i.e. was in import directory)
283 $self->SUPER::open_document(@_);
284 delete $self->{'metadata_read_store'}->{$filename};
285 }
286 else {
287 # Otherwise being processed as part of the *buildcol* phase
288 # (i.e. named directly by ArchiveInf plugin)
289 $self->{'doc_obj'} = new doc();
290 }
291
292 $self->{'section'} = "";
293}
294
295sub close_document {
296 my $self = shift(@_);
297
298 # add the associated files
299 my $assoc_files =
300 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
301
302 # for when "assocfilepath" isn't the same directory that doc.xml is in...
303 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
304
305 my $assoc_filepath=shift (@$assoc_filepath_list);
306
307 #rint STDERR "Filename is: " . $self->{'filename'} . "\n";
308 #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n";
309 #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n";
310 # Correct the assoc filepath if one is defined
311 if (defined ($assoc_filepath))
312 {
313 # Check whether the assoc_filepath already includes the base dir
314 if (index($assoc_filepath, $self->{'base_dir'}) == -1)
315 {
316 # And if not, append it so as to make this absolute
317 $assoc_filepath = &util::filename_cat($self->{'base_dir'}, $assoc_filepath);
318 }
319 }
320 else
321 {
322 $assoc_filepath = $self->{'filename'};
323 $assoc_filepath =~ s/[^\\\/]*$//;
324 }
325 #rint STDERR "Goned and made it absolute: $assoc_filepath\n";
326
327 foreach my $assoc_file_info (@$assoc_files) {
328 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
329 #rint STDERR "assoc_file: $assoc_file\n";
330 #rint STDERR "mime_type: $mime_type\n";
331 #rint STDERR "dir: $dir\n";
332 my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file),
333 my $assoc_dir = (defined $dir && $dir ne "")
334 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
335 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
336 #rint STDERR "According to me the real assoc_filepath is: $real_dir\n";
337 }
338 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
339
340 # process the document
341 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
342}
343
344
3451;
346
347
Note: See TracBrowser for help on using the repository browser.