source: main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm@ 28836

Last change on this file since 28836 was 28836, checked in by ak19, 10 years ago

A question on the mailing list involved accented characters in custom metadata set names (not metadata set values). This exposed an issue in greenstone that could not cope with utf8 characters in metaset names. The cause was the sub Char { use bytes; ... lines when reading XML. These needed to be commented out in both MetadataXMLPlugin and ReadXMLFile (as GreenstoneXMLPlugin inherits from ReadXMLFile). Doing so showed that extra Encode::decode() operations to decode strings read in from XML into utf8 were no longer needed. As a result MetaXMLPlug and GreenstoneXMLPlug no longer call decode on the metadaname name and value read in from XML, or for the full-text, since GreenstoneXMLPlugin in entirety now no longer does the 'use bytes' part. Tested with text and html collections where metadata set nanes created in custom .mds files, their assigned metadata values and a document's full-text all contained the utf-8 specific character of a-macron.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1
2###########################################################################
3#
4# GreenstoneXMLPlugin.pm
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Processes GreenstoneArchive XML documents. Note that this plugin does no
28# syntax checking (though the XML::Parser module tests for
29# well-formedness). It's assumed that the GreenstoneArchive files conform
30# to their DTD.
31
32package GreenstoneXMLPlugin;
33
34use Encode;
35use File::Basename;
36
37use ReadXMLFile;
38use util;
39use FileUtils;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and viceversa
43
44sub BEGIN {
45 @GreenstoneXMLPlugin::ISA = ('ReadXMLFile');
46}
47
48
49
50
51sub get_default_process_exp {
52 my $self = shift (@_);
53
54 return q^(?i)doc(-\d+)?\.xml$^;
55}
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp(),
62 'reqd' => "no" } ];
63
64my $options = { 'name' => "GreenstoneXMLPlugin",
65 'desc' => "{GreenstoneXMLPlugin.desc}",
66 'abstract' => "no",
67 'inherits' => "yes",
68 'args' => $arguments };
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
77
78 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
79
80 $self->{'section'} = "";
81 $self->{'section_level'} = 0;
82 $self->{'metadata_name'} = "";
83 $self->{'metadata_value'} = "";
84 $self->{'content'} = "";
85 $self->{'metadata_read_store'} = {};
86
87# # Currently used to store information for previous values controls. In
88# # the next contract I'll move to using information directly from Lucene.
89# $self->{'sqlfh'} = 0;
90
91 return bless $self, $class;
92}
93
94
95
96
97sub metadata_read {
98 my $self = shift (@_);
99 my ($pluginfo, $base_dir, $file, $block_hash,
100 $extrametakeys, $extrametadata, $extrametafile,
101 $processor, $gli, $aux) = @_;
102
103 my $outhandle = $self->{'outhandle'};
104
105 # can we process this file??
106 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
107
108 return undef unless $self->can_process_this_file($filename_full_path);
109
110 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
111
112 print $outhandle "GreenstoneXMLlugin: setting up block list for $file\n"
113 if $self->{'verbosity'} > 1;
114
115 my $line;
116 if (open(GIN,"<:utf8",$filename_full_path)) {
117
118 while (defined($line=<GIN>)) {
119 if ($line =~ m@<Metadata\s+name="gsdlassocfile">([^:]*):(?:[^:]*):(?:[^:]*)</Metadata>@) {
120 my $gsdl_assoc_file = $1;
121
122 my $dirname = dirname($filename_full_path);
123 my $full_gsdl_assoc_filename = &FileUtils::filenameConcatenate($dirname,$gsdl_assoc_file);
124 if ($self->{'verbosity'}>2) {
125 print $outhandle " Storing block list item: $full_gsdl_assoc_filename\n";
126 }
127
128 &util::block_filename($block_hash,$full_gsdl_assoc_filename);
129 }
130 }
131
132 close(GIN);
133 }
134 else {
135
136 print $outhandle "Error: Failed to open $file in GreenstoneXMLPlugin::metadata_read()\n";
137 print $outhandle " $!\n";
138 }
139
140
141 $self->{'metadata_read_store'}->{$filename_full_path} = 1;
142
143 return 1;
144}
145
146
147sub xml_start_document {
148
149 my $self = shift(@_);
150
151 my ($expat, $name, $sysid, $pubid, $internal) = @_;
152
153 my $outhandle = $self->{'outhandle'};
154 print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
155 print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
156
157}
158
159sub xml_end_document {
160}
161
162sub get_doctype {
163 my $self = shift(@_);
164
165 return "(Greenstone)?Archive";
166}
167
168
169sub xml_doctype {
170 my $self = shift(@_);
171
172 my ($expat, $name, $sysid, $pubid, $internal) = @_;
173
174 # Some doc.xml files that have been manipulated by XML::Rules
175 # no longer have the DOCTYPE line. No obvious way to fix
176 # the XML::Rules based code to keep DOCTYPE, so commenting
177 # out the code below to allow doc.xml files with DOCTYPE
178 # to be processed
179
180 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
181 # as well as the "Archive" files which should now be created by import.pl
182## die "" if ($name !~ /^(Greenstone)?Archive$/);
183
184# my $outhandle = $self->{'outhandle'};
185# print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
186# print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
187
188}
189
190
191sub xml_start_tag {
192 my $self = shift(@_);
193 my ($expat, $element) = @_;
194
195 $self->{'element'} = $element;
196 if ($element eq "Section") {
197 if ($self->{'section_level'} == 0) {
198 $self->open_document();
199 } else {
200 my $doc_obj = $self->{'doc_obj'};
201 $self->{'section'} =
202 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
203 }
204
205 $self->{'section_level'} ++;
206 }
207 elsif ($element eq "Metadata") {
208 $self->{'metadata_name'} = $_{'name'};
209 }
210}
211
212sub xml_end_tag {
213 my $self = shift(@_);
214 my ($expat, $element) = @_;
215
216 if ($element eq "Section") {
217 $self->{'section_level'} --;
218 $self->{'section'} = $self->{'doc_obj'}->get_parent_section ($self->{'section'});
219 $self->close_document() if $self->{'section_level'} == 0;
220 }
221 elsif ($element eq "Metadata") {
222 # text read in by XML::Parser is in Perl's binary byte value
223 # form ... need to explicitly make it UTF-8
224
225
226 my $metadata_name = $self->{'metadata_name'};
227 my $metadata_value = $self->{'metadata_value'};
228 #my $metadata_name = decode("utf-8",$self->{'metadata_name'});
229 #my $metadata_value = decode("utf-8",$self->{'metadata_value'});
230
231 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
232 $metadata_name,$metadata_value);
233
234 # Ensure this value is added to the allvalues database in gseditor.
235 # Note that the database constraints prevent multiple occurances of the
236 # same key-value pair.
237 # We write these out to a file, so they can all be commited in one
238 # transaction
239 #if (!$self->{'sqlfh'})
240 # {
241 # my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql";
242 # # If the file doesn't already exist, open it and begin a transaction
243 # my $sql_fh;
244 # if (!-e $sql_file)
245 # {
246 # open($sql_fh, ">" . $sql_file);
247 # print $sql_fh "BEGIN TRANSACTION;\n";
248 # }
249 # else
250 # {
251 # open($sql_fh, ">>" . $sql_file);
252 # }
253 # print STDERR "Opened SQL log\n";
254 # $self->{'sqlfh'} = $sql_fh;
255 # }
256
257 #my $mvalue = $self->{'metadata_value'};
258 #$mvalue =~ s/\'/\'\'/g;
259 #$mvalue =~ s/_claimantsep_/ \& /g;
260
261 #my $fh = $self->{'sqlfh'};
262 #if ($fh)
263 # {
264 # print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n";
265 # }
266
267 # Clean Up
268 $self->{'metadata_name'} = "";
269 $self->{'metadata_value'} = "";
270 }
271 elsif ($element eq "Content" && $self->{'content'} ne "") {
272
273 # text read in by XML::Parser is in Perl's binary byte value
274 # form ... need to explicitly make it UTF-8
275 #my $content = decode("utf-8",$self->{'content'});
276 my $content = $self->{'content'};
277
278 $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content);
279 $self->{'content'} = "";
280 }
281 $self->{'element'} = "";
282}
283
284sub xml_text {
285 my $self = shift(@_);
286 my ($expat) = @_;
287
288 if ($self->{'element'} eq "Metadata") {
289 $self->{'metadata_value'} .= $_;
290 }
291 elsif ($self->{'element'} eq "Content") {
292 $self->{'content'} .= $_;
293 }
294}
295
296sub open_document {
297 my $self = shift(@_);
298
299 my $filename = $self->{'filename'};
300
301 # create a new document
302 if (defined $self->{'metadata_read_store'}->{$filename}) {
303 # Being processed as part of *import* phase
304 # (i.e. was in import directory)
305 $self->SUPER::open_document(@_);
306 delete $self->{'metadata_read_store'}->{$filename};
307 }
308 else {
309 # Otherwise being processed as part of the *buildcol* phase
310 # (i.e. named directly by ArchiveInf plugin)
311 $self->{'doc_obj'} = new doc();
312 }
313
314 $self->{'section'} = "";
315}
316
317sub close_document {
318 my $self = shift(@_);
319
320 # add the associated files
321 my $assoc_files =
322 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
323
324 # for when "assocfilepath" isn't the same directory that doc.xml is in...
325 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
326
327 my $assoc_filepath=shift (@$assoc_filepath_list);
328
329 #rint STDERR "Filename is: " . $self->{'filename'} . "\n";
330 #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n";
331 #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n";
332 # Correct the assoc filepath if one is defined
333 if (defined ($assoc_filepath))
334 {
335 # Check whether the assoc_filepath already includes the base dir
336 if (index($assoc_filepath, $self->{'base_dir'}) == -1)
337 {
338 # And if not, append it so as to make this absolute
339 $assoc_filepath = &FileUtils::filenameConcatenate($self->{'base_dir'}, $assoc_filepath);
340 }
341 }
342 else
343 {
344 $assoc_filepath = $self->{'filename'};
345 $assoc_filepath =~ s/[^\\\/]*$//;
346 }
347 #rint STDERR "Goned and made it absolute: $assoc_filepath\n";
348
349 foreach my $assoc_file_info (@$assoc_files) {
350 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
351 #rint STDERR "assoc_file: $assoc_file\n";
352 #rint STDERR "mime_type: $mime_type\n";
353 #rint STDERR "dir: $dir\n";
354 my $real_dir = &FileUtils::filenameConcatenate($assoc_filepath, $assoc_file),
355 my $assoc_dir = (defined $dir && $dir ne "")
356 ? &FileUtils::filenameConcatenate($dir, $assoc_file) : $assoc_file;
357 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
358 #rint STDERR "According to me the real assoc_filepath is: $real_dir\n";
359 }
360 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
361
362 # process the document
363 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
364
365 $self->{'num_processed'} ++;
366 undef $self->{'doc_obj'};
367}
368
369
3701;
371
372
Note: See TracBrowser for help on using the repository browser.