source: main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1
2###########################################################################
3#
4# GreenstoneXMLPlugin.pm
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Processes GreenstoneArchive XML documents. Note that this plugin does no
28# syntax checking (though the XML::Parser module tests for
29# well-formedness). It's assumed that the GreenstoneArchive files conform
30# to their DTD.
31
32package GreenstoneXMLPlugin;
33
34use Encode;
35use File::Basename;
36
37use ReadXMLFile;
38use util;
39use FileUtils;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and viceversa
43
44sub BEGIN {
45 @GreenstoneXMLPlugin::ISA = ('ReadXMLFile');
46}
47
48
49
50
51sub get_default_process_exp {
52 my $self = shift (@_);
53
54 return q^(?i)doc(-\d+)?\.xml$^;
55}
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BaseImporter.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp(),
62 'reqd' => "no" } ];
63
64my $options = { 'name' => "GreenstoneXMLPlugin",
65 'desc' => "{GreenstoneXMLPlugin.desc}",
66 'abstract' => "no",
67 'inherits' => "yes",
68 'args' => $arguments };
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
77
78 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
79
80 $self->{'section'} = "";
81 $self->{'section_level'} = 0;
82 $self->{'metadata_name'} = "";
83 $self->{'metadata_value'} = "";
84 $self->{'content'} = "";
85 $self->{'metadata_read_store'} = {};
86
87# # Currently used to store information for previous values controls. In
88# # the next contract I'll move to using information directly from Lucene.
89# $self->{'sqlfh'} = 0;
90
91 return bless $self, $class;
92}
93
94
95
96
97sub metadata_read {
98 my $self = shift (@_);
99 my ($pluginfo, $base_dir, $file, $block_hash,
100 $extrametakeys, $extrametadata, $extrametafile,
101 $processor, $gli, $aux) = @_;
102
103 my $outhandle = $self->{'outhandle'};
104
105 # can we process this file??
106 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
107
108 return undef unless $self->can_process_this_file($filename_full_path);
109
110 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
111
112 print $outhandle "GreenstoneXMLPlugin: setting up block list for $file\n"
113 if $self->{'verbosity'} > 1;
114
115 my $line;
116 if (open(GIN,"<:utf8",$filename_full_path)) {
117
118 while (defined($line=<GIN>)) {
119 if ($line =~ m@<Metadata\s+name="gsdlassocfile">([^:]*):(?:[^:]*):(?:[^:]*)</Metadata>@) {
120 my $gsdl_assoc_file = $1;
121
122 my $dirname = dirname($filename_full_path);
123 my $full_gsdl_assoc_filename = &FileUtils::filenameConcatenate($dirname,$gsdl_assoc_file);
124 if ($self->{'verbosity'}>2) {
125 print $outhandle " Storing block list item: $full_gsdl_assoc_filename\n";
126 }
127
128 # is this raw filename here, or unicode?? assuming unicode
129 # however we have concatenated raw directory???
130 $self->block_filename($block_hash,$full_gsdl_assoc_filename);
131 }
132 }
133
134 close(GIN);
135 }
136 else {
137
138 print $outhandle "Error: Failed to open $file in GreenstoneXMLPlugin::metadata_read()\n";
139 print $outhandle " $!\n";
140 }
141
142
143 $self->{'metadata_read_store'}->{$filename_full_path} = 1;
144
145 return 1;
146}
147
148
149sub xml_start_document {
150
151 my $self = shift(@_);
152
153 my ($expat, $name, $sysid, $pubid, $internal) = @_;
154
155 my $outhandle = $self->{'outhandle'};
156 print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
157 print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
158
159}
160
161sub xml_end_document {
162}
163
164sub get_doctype {
165 my $self = shift(@_);
166
167 return "(Greenstone)?Archive";
168}
169
170
171sub xml_doctype {
172 my $self = shift(@_);
173
174 my ($expat, $name, $sysid, $pubid, $internal) = @_;
175
176 # Some doc.xml files that have been manipulated by XML::Rules
177 # no longer have the DOCTYPE line. No obvious way to fix
178 # the XML::Rules based code to keep DOCTYPE, so commenting
179 # out the code below to allow doc.xml files with DOCTYPE
180 # to be processed
181
182 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
183 # as well as the "Archive" files which should now be created by import.pl
184## die "" if ($name !~ /^(Greenstone)?Archive$/);
185
186# my $outhandle = $self->{'outhandle'};
187# print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
188# print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
189
190}
191
192
193sub xml_start_tag {
194 my $self = shift(@_);
195 my ($expat, $element) = @_;
196
197 $self->{'element'} = $element;
198 if ($element eq "Section") {
199 if ($self->{'section_level'} == 0) {
200 $self->open_document();
201 } else {
202 my $doc_obj = $self->{'doc_obj'};
203 $self->{'section'} =
204 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
205 }
206
207 $self->{'section_level'} ++;
208 }
209 elsif ($element eq "Metadata") {
210 $self->{'metadata_name'} = $_{'name'};
211 }
212}
213
214sub xml_end_tag {
215 my $self = shift(@_);
216 my ($expat, $element) = @_;
217
218 if ($element eq "Section") {
219 $self->{'section_level'} --;
220 $self->{'section'} = $self->{'doc_obj'}->get_parent_section ($self->{'section'});
221 $self->close_document() if $self->{'section_level'} == 0;
222 }
223 elsif ($element eq "Metadata") {
224 # text read in by XML::Parser is in Perl's binary byte value
225 # form ... need to explicitly make it UTF-8
226
227
228 my $metadata_name = $self->{'metadata_name'};
229 my $metadata_value = $self->{'metadata_value'};
230 #my $metadata_name = decode("utf-8",$self->{'metadata_name'});
231 #my $metadata_value = decode("utf-8",$self->{'metadata_value'});
232
233 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
234 $metadata_name,$metadata_value);
235
236 # Ensure this value is added to the allvalues database in gseditor.
237 # Note that the database constraints prevent multiple occurances of the
238 # same key-value pair.
239 # We write these out to a file, so they can all be commited in one
240 # transaction
241 #if (!$self->{'sqlfh'})
242 # {
243 # my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql";
244 # # If the file doesn't already exist, open it and begin a transaction
245 # my $sql_fh;
246 # if (!-e $sql_file)
247 # {
248 # open($sql_fh, ">" . $sql_file);
249 # print $sql_fh "BEGIN TRANSACTION;\n";
250 # }
251 # else
252 # {
253 # open($sql_fh, ">>" . $sql_file);
254 # }
255 # print STDERR "Opened SQL log\n";
256 # $self->{'sqlfh'} = $sql_fh;
257 # }
258
259 #my $mvalue = $self->{'metadata_value'};
260 #$mvalue =~ s/\'/\'\'/g;
261 #$mvalue =~ s/_claimantsep_/ \& /g;
262
263 #my $fh = $self->{'sqlfh'};
264 #if ($fh)
265 # {
266 # print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n";
267 # }
268
269 # Clean Up
270 $self->{'metadata_name'} = "";
271 $self->{'metadata_value'} = "";
272 }
273 elsif ($element eq "Content" && $self->{'content'} ne "") {
274
275 # text read in by XML::Parser is in Perl's binary byte value
276 # form ... need to explicitly make it UTF-8
277 #my $content = decode("utf-8",$self->{'content'});
278 my $content = $self->{'content'};
279
280 $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content);
281 $self->{'content'} = "";
282 }
283 $self->{'element'} = "";
284}
285
286sub xml_text {
287 my $self = shift(@_);
288 my ($expat) = @_;
289
290 if ($self->{'element'} eq "Metadata") {
291 $self->{'metadata_value'} .= $_;
292 }
293 elsif ($self->{'element'} eq "Content") {
294 $self->{'content'} .= $_;
295 }
296}
297
298sub open_document {
299 my $self = shift(@_);
300
301 my $filename = $self->{'filename'};
302
303 # create a new document
304 if (defined $self->{'metadata_read_store'}->{$filename}) {
305 # Being processed as part of *import* phase
306 # (i.e. was in import directory)
307 $self->SUPER::open_document(@_);
308 delete $self->{'metadata_read_store'}->{$filename};
309 }
310 else {
311 # Otherwise being processed as part of the *buildcol* phase
312 # (i.e. named directly by ArchiveInf plugin)
313 $self->{'doc_obj'} = new doc();
314 }
315
316 $self->{'section'} = "";
317}
318
319sub close_document {
320 my $self = shift(@_);
321
322 # add the associated files
323 my $assoc_files =
324 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
325
326 # for when "assocfilepath" isn't the same directory that doc.xml is in...
327 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
328
329 my $assoc_filepath=shift (@$assoc_filepath_list);
330
331 #rint STDERR "Filename is: " . $self->{'filename'} . "\n";
332 #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n";
333 #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n";
334 # Correct the assoc filepath if one is defined
335 if (defined ($assoc_filepath))
336 {
337 # Check whether the assoc_filepath already includes the base dir
338 if (index($assoc_filepath, $self->{'base_dir'}) == -1)
339 {
340 # And if not, append it so as to make this absolute
341 $assoc_filepath = &FileUtils::filenameConcatenate($self->{'base_dir'}, $assoc_filepath);
342 }
343 }
344 else
345 {
346 $assoc_filepath = $self->{'filename'};
347 $assoc_filepath =~ s/[^\\\/]*$//;
348 }
349 #rint STDERR "Goned and made it absolute: $assoc_filepath\n";
350
351 foreach my $assoc_file_info (@$assoc_files) {
352 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
353 #rint STDERR "assoc_file: $assoc_file\n";
354 #rint STDERR "mime_type: $mime_type\n";
355 #rint STDERR "dir: $dir\n";
356 my $real_dir = &FileUtils::filenameConcatenate($assoc_filepath, $assoc_file),
357 my $assoc_dir = (defined $dir && $dir ne "")
358 ? &FileUtils::filenameConcatenate($dir, $assoc_file) : $assoc_file;
359 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
360 #rint STDERR "According to me the real assoc_filepath is: $real_dir\n";
361 }
362 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
363
364 # process the document
365 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
366
367 $self->{'num_processed'} ++;
368 undef $self->{'doc_obj'};
369}
370
371
3721;
373
374
Note: See TracBrowser for help on using the repository browser.