source: main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm@ 22841

Last change on this file since 22841 was 22841, checked in by davidb, 14 years ago

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form). In the case of this plugin, both text and metadata (read from doc.xml using XML::Parser) is is binary byte format, and so needs to be decoded before being added into $doc_obj->add_utf8_...l

  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1
2###########################################################################
3#
4# GreenstoneXMLPlugin.pm
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Processes GreenstoneArchive XML documents. Note that this plugin does no
28# syntax checking (though the XML::Parser module tests for
29# well-formedness). It's assumed that the GreenstoneArchive files conform
30# to their DTD.
31
32package GreenstoneXMLPlugin;
33
34use Encode;
35
36use ReadXMLFile;
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41sub BEGIN {
42 @GreenstoneXMLPlugin::ISA = ('ReadXMLFile');
43}
44
45
46
47
48sub get_default_process_exp {
49 my $self = shift (@_);
50
51 return q^(?i)doc\.xml$^;
52}
53
54my $arguments =
55 [ { 'name' => "process_exp",
56 'desc' => "{BasePlugin.process_exp}",
57 'type' => "regexp",
58 'deft' => &get_default_process_exp(),
59 'reqd' => "no" } ];
60
61my $options = { 'name' => "GreenstoneXMLPlugin",
62 'desc' => "{GreenstoneXMLPlugin.desc}",
63 'abstract' => "no",
64 'inherits' => "yes",
65 'args' => $arguments };
66
67sub new {
68 my ($class) = shift (@_);
69 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
70 push(@$pluginlist, $class);
71
72 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
73 push(@{$hashArgOptLists->{"OptList"}},$options);
74
75 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
76
77 $self->{'section'} = "";
78 $self->{'section_level'} = 0;
79 $self->{'metadata_name'} = "";
80 $self->{'metadata_value'} = "";
81 $self->{'content'} = "";
82
83# # Currently used to store information for previous values controls. In
84# # the next contract I'll move to using information directly from Lucene.
85# $self->{'sqlfh'} = 0;
86
87 return bless $self, $class;
88}
89
90sub xml_start_document {
91}
92
93sub xml_end_document {
94}
95
96sub get_doctype {
97 my $self = shift(@_);
98
99 return "(Greenstone)?Archive";
100}
101
102
103sub xml_doctype {
104 my $self = shift(@_);
105
106 my ($expat, $name, $sysid, $pubid, $internal) = @_;
107
108 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
109 # as well as the "Archive" files which should now be created by import.pl
110 die "" if ($name !~ /^(Greenstone)?Archive$/);
111
112 my $outhandle = $self->{'outhandle'};
113 print $outhandle "GreenstoneXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
114 print STDERR "<Processing n='$self->{'file'}' p='GreenstoneXMLPlugin'>\n" if $self->{'gli'};
115
116}
117
118
119sub xml_start_tag {
120 my $self = shift(@_);
121 my ($expat, $element) = @_;
122
123 $self->{'element'} = $element;
124 if ($element eq "Section") {
125 if ($self->{'section_level'} == 0) {
126 $self->open_document();
127 } else {
128 my $doc_obj = $self->{'doc_obj'};
129 $self->{'section'} =
130 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
131 }
132
133 $self->{'section_level'} ++;
134 }
135 elsif ($element eq "Metadata") {
136 $self->{'metadata_name'} = $_{'name'};
137 }
138}
139
140sub xml_end_tag {
141 my $self = shift(@_);
142 my ($expat, $element) = @_;
143
144 if ($element eq "Section") {
145 $self->{'section_level'} --;
146 $self->{'section'} = $self->{'doc_obj'}->get_parent_section ($self->{'section'});
147 $self->close_document() if $self->{'section_level'} == 0;
148 }
149 elsif ($element eq "Metadata") {
150 # text read in by XML::Parser is in Perl's binary byte value
151 # form ... need to explicitly make it UTF-8
152
153 my $metadata_name = decode("utf-8",$self->{'metadata_name'});
154 my $metadata_value = decode("utf-8",$self->{'metadata_value'});
155
156 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
157 $metadata_name,$metadata_value);
158
159 # Ensure this value is added to the allvalues database in gseditor.
160 # Note that the database constraints prevent multiple occurances of the
161 # same key-value pair.
162 # We write these out to a file, so they can all be commited in one
163 # transaction
164 #if (!$self->{'sqlfh'})
165 # {
166 # my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql";
167 # # If the file doesn't already exist, open it and begin a transaction
168 # my $sql_fh;
169 # if (!-e $sql_file)
170 # {
171 # open($sql_fh, ">" . $sql_file);
172 # print $sql_fh "BEGIN TRANSACTION;\n";
173 # }
174 # else
175 # {
176 # open($sql_fh, ">>" . $sql_file);
177 # }
178 # print STDERR "Opened SQL log\n";
179 # $self->{'sqlfh'} = $sql_fh;
180 # }
181
182 #my $mvalue = $self->{'metadata_value'};
183 #$mvalue =~ s/\'/\'\'/g;
184 #$mvalue =~ s/_claimantsep_/ \& /g;
185
186 #my $fh = $self->{'sqlfh'};
187 #if ($fh)
188 # {
189 # print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n";
190 # }
191
192 # Clean Up
193 $self->{'metadata_name'} = "";
194 $self->{'metadata_value'} = "";
195 }
196 elsif ($element eq "Content" && $self->{'content'} ne "") {
197
198 # text read in by XML::Parser is in Perl's binary byte value
199 # form ... need to explicitly make it UTF-8
200 my $content = decode("utf-8",$self->{'content'});
201
202 $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content);
203 $self->{'content'} = "";
204 }
205 $self->{'element'} = "";
206}
207
208sub xml_text {
209 my $self = shift(@_);
210 my ($expat) = @_;
211
212 if ($self->{'element'} eq "Metadata") {
213 $self->{'metadata_value'} .= $_;
214 }
215 elsif ($self->{'element'} eq "Content") {
216 $self->{'content'} .= $_;
217 }
218}
219
220sub open_document {
221 my $self = shift(@_);
222
223 # create a new document
224 $self->{'doc_obj'} = new doc ();
225 $self->{'section'} = "";
226}
227
228sub close_document {
229 my $self = shift(@_);
230
231 # add the associated files
232 my $assoc_files =
233 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
234
235 # for when "assocfilepath" isn't the same directory that doc.xml is in...
236 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
237
238 my $assoc_filepath=shift (@$assoc_filepath_list);
239
240 #rint STDERR "Filename is: " . $self->{'filename'} . "\n";
241 #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n";
242 #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n";
243 # Correct the assoc filepath if one is defined
244 if (defined ($assoc_filepath))
245 {
246 # Check whether the assoc_filepath already includes the base dir
247 if (index($assoc_filepath, $self->{'base_dir'}) == -1)
248 {
249 # And if not, append it so as to make this absolute
250 $assoc_filepath = &util::filename_cat($self->{'base_dir'}, $assoc_filepath);
251 }
252 }
253 else
254 {
255 $assoc_filepath = $self->{'filename'};
256 $assoc_filepath =~ s/[^\\\/]*$//;
257 }
258 #rint STDERR "Goned and made it absolute: $assoc_filepath\n";
259
260 foreach my $assoc_file_info (@$assoc_files) {
261 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
262 #rint STDERR "assoc_file: $assoc_file\n";
263 #rint STDERR "mime_type: $mime_type\n";
264 #rint STDERR "dir: $dir\n";
265 my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file),
266 my $assoc_dir = (defined $dir && $dir ne "")
267 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
268 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
269 #rint STDERR "According to me the real assoc_filepath is: $real_dir\n";
270 }
271 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
272
273 # process the document
274 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
275}
276
277
2781;
279
280
Note: See TracBrowser for help on using the repository browser.