source: main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1
2###########################################################################
3#
4# GreenstoneXMLPlugin.pm
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Processes GreenstoneArchive XML documents. Note that this plugin does no
28# syntax checking (though the XML::Parser module tests for
29# well-formedness). It's assumed that the GreenstoneArchive files conform
30# to their DTD.
31
32package GreenstoneXMLPlugin;
33
34use Encode;
35use File::Basename;
36
37use ReadXMLFile;
38use util;
39use FileUtils;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and viceversa
43
44sub BEGIN {
45 @GreenstoneXMLPlugin::ISA = ('ReadXMLFile');
46}
47
48
49
50
51sub get_default_process_exp {
52 my $self = shift (@_);
53
54 return q^(?i)doc\.xml$^;
55}
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp(),
62 'reqd' => "no" } ];
63
64my $options = { 'name' => "GreenstoneXMLPlugin",
65 'desc' => "{GreenstoneXMLPlugin.desc}",
66 'abstract' => "no",
67 'inherits' => "yes",
68 'args' => $arguments };
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
77
78 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
79
80 $self->{'section'} = "";
81 $self->{'section_level'} = 0;
82 $self->{'metadata_name'} = "";
83 $self->{'metadata_value'} = "";
84 $self->{'content'} = "";
85 $self->{'metadata_read_store'} = {};
86
87# # Currently used to store information for previous values controls. In
88# # the next contract I'll move to using information directly from Lucene.
89# $self->{'sqlfh'} = 0;
90
91 return bless $self, $class;
92}
93
94
95
96
97sub metadata_read {
98 my $self = shift (@_);
99 my ($pluginfo, $base_dir, $file, $block_hash,
100 $extrametakeys, $extrametadata, $extrametafile,
101 $processor, $gli, $aux) = @_;
102
103 my $outhandle = $self->{'outhandle'};
104
105 # can we process this file??
106 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
107 return undef unless $self->can_process_this_file($filename_full_path);
108
109 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
110
111 print $outhandle "GreenstoneXMLlugin: setting up block list for $file\n"
112 if $self->{'verbosity'} > 1;
113
114 my $line;
115 if (open(GIN,"<:utf8",$filename_full_path)) {
116
117 while (defined($line=<GIN>)) {
118 if ($line =~ m@<Metadata\s+name="gsdlassocfile">([^:]*):(?:[^:]*):(?:[^:]*)</Metadata>@) {
119 my $gsdl_assoc_file = $1;
120
121 my $dirname = dirname($filename_full_path);
122 my $full_gsdl_assoc_filename = &FileUtils::filenameConcatenate($dirname,$gsdl_assoc_file);
123 if ($self->{'verbosity'}>2) {
124 print $outhandle " Storing block list item: $full_gsdl_assoc_filename\n";
125 }
126
127 &util::block_filename($block_hash,$full_gsdl_assoc_filename);
128 }
129 }
130
131 close(GIN);
132 }
133 else {
134
135 print $outhandle "Error: Failed to open $file in GreenstoneXMLPlugin::metadata_read()\n";
136 print $outhandle " $!\n";
137 }
138
139
140 $self->{'metadata_read_store'}->{$filename_full_path} = 1;
141
142 return 1;
143}
144
145
146sub xml_start_document {
147}
148
149sub xml_end_document {
150}
151
152sub get_doctype {
153 my $self = shift(@_);
154
155 return "(Greenstone)?Archive";
156}
157
158
159sub xml_doctype {
160 my $self = shift(@_);
161
162 my ($expat, $name, $sysid, $pubid, $internal) = @_;
163
164 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
165 # as well as the "Archive" files which should now be created by import.pl
166 die "" if ($name !~ /^(Greenstone)?Archive$/);
167
168 my $outhandle = $self->{'outhandle'};
169 print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
170 print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
171
172}
173
174
175sub xml_start_tag {
176 my $self = shift(@_);
177 my ($expat, $element) = @_;
178
179 $self->{'element'} = $element;
180 if ($element eq "Section") {
181 if ($self->{'section_level'} == 0) {
182 $self->open_document();
183 } else {
184 my $doc_obj = $self->{'doc_obj'};
185 $self->{'section'} =
186 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
187 }
188
189 $self->{'section_level'} ++;
190 }
191 elsif ($element eq "Metadata") {
192 $self->{'metadata_name'} = $_{'name'};
193 }
194}
195
196sub xml_end_tag {
197 my $self = shift(@_);
198 my ($expat, $element) = @_;
199
200 if ($element eq "Section") {
201 $self->{'section_level'} --;
202 $self->{'section'} = $self->{'doc_obj'}->get_parent_section ($self->{'section'});
203 $self->close_document() if $self->{'section_level'} == 0;
204 }
205 elsif ($element eq "Metadata") {
206 # text read in by XML::Parser is in Perl's binary byte value
207 # form ... need to explicitly make it UTF-8
208
209
210 my $metadata_name = decode("utf-8",$self->{'metadata_name'});
211 my $metadata_value = decode("utf-8",$self->{'metadata_value'});
212
213 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
214 $metadata_name,$metadata_value);
215
216 # Ensure this value is added to the allvalues database in gseditor.
217 # Note that the database constraints prevent multiple occurances of the
218 # same key-value pair.
219 # We write these out to a file, so they can all be commited in one
220 # transaction
221 #if (!$self->{'sqlfh'})
222 # {
223 # my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql";
224 # # If the file doesn't already exist, open it and begin a transaction
225 # my $sql_fh;
226 # if (!-e $sql_file)
227 # {
228 # open($sql_fh, ">" . $sql_file);
229 # print $sql_fh "BEGIN TRANSACTION;\n";
230 # }
231 # else
232 # {
233 # open($sql_fh, ">>" . $sql_file);
234 # }
235 # print STDERR "Opened SQL log\n";
236 # $self->{'sqlfh'} = $sql_fh;
237 # }
238
239 #my $mvalue = $self->{'metadata_value'};
240 #$mvalue =~ s/\'/\'\'/g;
241 #$mvalue =~ s/_claimantsep_/ \& /g;
242
243 #my $fh = $self->{'sqlfh'};
244 #if ($fh)
245 # {
246 # print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n";
247 # }
248
249 # Clean Up
250 $self->{'metadata_name'} = "";
251 $self->{'metadata_value'} = "";
252 }
253 elsif ($element eq "Content" && $self->{'content'} ne "") {
254
255 # text read in by XML::Parser is in Perl's binary byte value
256 # form ... need to explicitly make it UTF-8
257 my $content = decode("utf-8",$self->{'content'});
258
259 $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content);
260 $self->{'content'} = "";
261 }
262 $self->{'element'} = "";
263}
264
265sub xml_text {
266 my $self = shift(@_);
267 my ($expat) = @_;
268
269 if ($self->{'element'} eq "Metadata") {
270 $self->{'metadata_value'} .= $_;
271 }
272 elsif ($self->{'element'} eq "Content") {
273 $self->{'content'} .= $_;
274 }
275}
276
277sub open_document {
278 my $self = shift(@_);
279
280 my $filename = $self->{'filename'};
281
282 # create a new document
283 if (defined $self->{'metadata_read_store'}->{$filename}) {
284 # Being processed as part of *import* phase
285 # (i.e. was in import directory)
286 $self->SUPER::open_document(@_);
287 delete $self->{'metadata_read_store'}->{$filename};
288 }
289 else {
290 # Otherwise being processed as part of the *buildcol* phase
291 # (i.e. named directly by ArchiveInf plugin)
292 $self->{'doc_obj'} = new doc();
293 }
294
295 $self->{'section'} = "";
296}
297
298sub close_document {
299 my $self = shift(@_);
300
301 # add the associated files
302 my $assoc_files =
303 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
304
305 # for when "assocfilepath" isn't the same directory that doc.xml is in...
306 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
307
308 my $assoc_filepath=shift (@$assoc_filepath_list);
309
310 #rint STDERR "Filename is: " . $self->{'filename'} . "\n";
311 #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n";
312 #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n";
313 # Correct the assoc filepath if one is defined
314 if (defined ($assoc_filepath))
315 {
316 # Check whether the assoc_filepath already includes the base dir
317 if (index($assoc_filepath, $self->{'base_dir'}) == -1)
318 {
319 # And if not, append it so as to make this absolute
320 $assoc_filepath = &FileUtils::filenameConcatenate($self->{'base_dir'}, $assoc_filepath);
321 }
322 }
323 else
324 {
325 $assoc_filepath = $self->{'filename'};
326 $assoc_filepath =~ s/[^\\\/]*$//;
327 }
328 #rint STDERR "Goned and made it absolute: $assoc_filepath\n";
329
330 foreach my $assoc_file_info (@$assoc_files) {
331 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
332 #rint STDERR "assoc_file: $assoc_file\n";
333 #rint STDERR "mime_type: $mime_type\n";
334 #rint STDERR "dir: $dir\n";
335 my $real_dir = &FileUtils::filenameConcatenate($assoc_filepath, $assoc_file),
336 my $assoc_dir = (defined $dir && $dir ne "")
337 ? &FileUtils::filenameConcatenate($dir, $assoc_file) : $assoc_file;
338 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
339 #rint STDERR "According to me the real assoc_filepath is: $real_dir\n";
340 }
341 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
342
343 # process the document
344 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
345
346 $self->{'num_processed'} ++;
347 undef $self->{'doc_obj'};
348}
349
350
3511;
352
353
Note: See TracBrowser for help on using the repository browser.