source: main/trunk/greenstone2/perllib/plugins/GreenstoneMETSPlugin.pm@ 23484

Last change on this file since 23484 was 22840, checked in by davidb, 14 years ago

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form). In the case of this plugin, text that comes in from doctxt.xml (read with XPATH) is Unicode aware (can be added to $doc_obj directly with add_utf8_....). In the case of the metadata (read from docmets.xml using XML::Parser) is is binary byte format, and so needs to be decoded before being added into $doc_obj->add_utf8_...l

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1###########################################################################
2#
3# GreenstoneMETSPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes GreenstoneArchive METS documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness). It's assumed that the GreenstoneArchive files conform
29# to their DTD.
30
31
32package GreenstoneMETSPlugin;
33
34use Encode;
35use ghtml;
36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40use ReadXMLFile;
41use XML::XPath;
42use XML::XPath::XMLParser;
43
44sub BEGIN {
45 @GreenstoneMETSPlugin::ISA = ('ReadXMLFile');
46}
47
48my $arguments = [ { 'name' => "process_exp",
49 'desc' => "{BasePlugin.process_exp}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => &get_default_process_exp()
53 }
54 ];
55
56my $options = { 'name' => "GreenstoneMETSPlugin",
57 'desc' => "{GreenstoneMETSPlugin.desc}",
58 'abstract' => "no",
59 'inherits' => "yes",
60 'args' => $arguments };
61
62
63
64sub get_default_process_exp {
65 my $self = shift (@_);
66
67 return q^(?i)docmets\.xml$^;
68}
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 # have no args - do we still want this?
76 #push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
77 push(@{$hashArgOptLists->{"OptList"}},$options);
78
79 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
80
81 $self->{'section'} = "";
82 $self->{'section_level'} = 0;
83 $self->{'metadata_name'} = "";
84 $self->{'metadata_value'} = "";
85 $self->{'content'} = "";
86
87 return bless $self, $class;
88}
89
90sub xml_start_document {
91 my $self = shift (@_);
92 my ($expat, $element) = @_;
93
94 $self->{'section'} = "";
95 $self->{'section_level'} = 0;
96 $self->{'metadata_name'} = "";
97 $self->{'metadata_value'} = "";
98 $self->{'content'} = "";
99
100 #**defined a dmdSection Table
101 $self->{'dmdSec_table'}={};
102
103 #**defined a fileSection Table
104 $self->{'fileSec_table'}={};
105
106 #***open doctxt.xml and read the data in
107 my $filename = $self->{'filename'};
108
109 $filename =~ s/docmets.xml$/doctxt.xml/;
110
111 if (!open (FILEIN, "<:utf8", $filename)) {
112 print STDERR "Warning: unable to open the $filename\n";
113 $self->{'xmltxt'} = "";
114 }
115 else {
116 my $xml_text = "";
117 while (defined (my $line = <FILEIN>)) {
118 if ($line !~ m/^<!DOCTYPE.*>/) {
119 $xml_text .= $line;
120 }
121 }
122
123 my $xml_parser = XML::XPath->new (xml=> $xml_text);
124 #my $xml_tree = $xml_parser->parse ($xml_text);
125
126 #eval {$self->{'parser_text'}->parse};
127 $self->{'parsed_xml'} = $xml_parser;
128 }
129 my $outhandle = $self->{'outhandle'};
130 print $outhandle "GreenstoneMETSPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
131 print STDERR "<Processing n='$self->{'file'}' p='GreenstoneMETSPlugin'>\n" if ($self->{'gli'});
132
133}
134
135sub xml_end_document {
136}
137
138sub xml_doctype {
139}
140
141sub xml_start_tag {
142 my $self = shift(@_);
143 my ($expat, $element) = @_;
144 $self->{'element'} = $element;
145 #**deal with dmdSection
146 if ($element =~ /^(mets:)?dmdSec$/ || $element =~ /(gsdl3:)?Metadata$/){
147 $self->xml_dmd_start_tag (@_);
148 } elsif ($element =~ /^(mets:)?file$/) {
149 # only store the file_id for sections with text. Not for default ids (assoc files)
150 if ($_{'ID'} =~ m/FILE(.*)/) {
151 $self->{'file_Id'} = $1;
152 }
153 else {
154 undef $self->{'file_Id'};
155 }
156 } elsif ($element =~ /^(mets:)?FLocat$/){
157 #***deal with fileSection
158 $self->xml_fileloc_start_tag (@_);
159 } elsif ($element =~ /^(mets:)?div$/){
160 #***deal with StrucMap Section
161 $self->xml_strucMap_start_tag (@_);
162 }
163}
164
165sub xml_dmd_start_tag {
166 my $self = shift (@_);
167 my ($expat, $element) = @_;
168
169 if ($element =~ /^(mets:)?dmdSec$/){
170 my ($section_num) = ($_{'ID'} =~ m/DM(.*)/);
171 $self->{'dmdSec_table'}->{"$section_num"}=[];
172 $self->{'dmdSec_table'}->{'section_num'}=$section_num;
173 } elsif ($element =~ /^(gsdl3:)?Metadata$/) {
174 $self->{'metadata_name'} = $_{'name'};
175 }
176}
177
178sub xml_fileloc_start_tag {
179 my $self = shift (@_);
180 my ($expat, $element) = @_;
181
182 my $xlink = $_{'xlink:href'};
183 if (!defined $xlink) {
184 # try without namespace
185 $xlink = $_{'href'};
186 }
187 #my ($section_num) = ($_{'ID'} =~ m/^FLOCAT(.*)$/);
188 my $section_num = $self->{'file_Id'};
189 return if (!defined $section_num);
190
191 $self->{'fileSec_table'}->{"$section_num"}=[];
192 $self->{'fileSec_table'}->{'section_num'}=$section_num;
193
194 my ($filename,$xpath_expr)=($xlink =~ m/^file:(.*)\#xpointer\((.*)\)$/);
195 my $nodeset = $self->{'parsed_xml'}->findnodes ($xpath_expr);
196 my $node_size= $nodeset->size;
197
198 if ($node_size==0) {
199 print STDERR "Warning: no text associated with XPATH $xpath_expr\n";
200 }
201 else {
202 foreach my $node ($nodeset->get_nodelist) {
203 my $xml_content = XML::XPath::XMLParser::as_string($node);
204 my $unescaped_xml_content = &ghtml::unescape_html($xml_content);
205 my $section_content={'section_content'=> $unescaped_xml_content};
206
207 my $content_list = $self->{'fileSec_table'}->{"$section_num"};
208 push (@$content_list, $section_content);
209 }
210 }
211}
212
213sub xml_strucMap_start_tag {
214 my $self = shift (@_);
215 my ($expat, $element) = @_;
216
217
218 my ($section_num) = ($_{'ID'} =~ m/DS(.*)/);
219
220 if ($_{'ID'} ne "DSAll"){
221 if ($self->{'section_level'}==0) {
222 $self->open_document();
223 } else {
224 my $doc_obj = $self->{'doc_obj'};
225 $self->{'section'}=
226 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
227 }
228 $self->{'section_level'}++;
229
230 #***Add metadata from dmdSection
231 my $md_list = $self->{'dmdSec_table'}->{"$section_num"};
232
233 foreach my $md_pair (@$md_list){
234 # text read in by XML::Parser is in Perl's binary byte value
235 # form ... need to explicitly make it UTF-8
236
237 my $metadata_name = decode("utf8",$md_pair->{'metadata_name'});
238 my $metadata_value = decode("utf8",$md_pair->{'metadata_value'});
239
240 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
241 $metadata_name, $metadata_value);
242 }
243
244 #*** Add content from fileSection
245 my $content_list = $self->{'fileSec_table'}->{"$section_num"};
246
247 foreach my $section_content (@$content_list){
248 # Don't need to decode $content as this has been readin in
249 # through XPath which (unlike XML::Parser) correctly sets
250 # the string to be UTF8 rather than a 'binary' string of bytes
251 my $content = $section_content->{'section_content'};
252
253 $self->{'doc_obj'}->add_utf8_text($self->{'section'},$content);
254 }
255 }
256}
257
258sub get_doctype {
259 my $self = shift(@_);
260
261 return "mets:mets";
262}
263
264sub xml_end_tag {
265 my $self = shift(@_);
266 my ($expat, $element) = @_;
267
268 if ($element =~ /^(gsdl3:)?Metadata$/) {
269 my $section_num = $self->{'dmdSec_table'}->{'section_num'};
270 my $metadata_name=$self->{'metadata_name'};
271 my $metadata_value=$self->{'metadata_value'};
272
273 my $md_pair={'metadata_name' => $metadata_name,
274 'metadata_value'=> $metadata_value};
275
276 my $md_list = $self->{'dmdSec_table'}->{"$section_num"};
277
278 push(@$md_list,$md_pair);
279
280 $self->{'metadata_name'} = "";
281 $self->{'metadata_value'} = "";
282 } elsif ($element =~ /^(mets:)?file$/){
283 $self->{'file_id'} = "";
284 }
285
286
287 #*** StrucMap Section
288 if ($element =~ /^(mets:)?div$/) {
289 $self->{'section_level'}--;
290 $self->{'section'} = $self->{'doc_obj'}->get_parent_section($self->{'section'});
291 $self->close_document() if $self->{'section_level'}==0;
292 }
293 $self->{'element'} = "";
294}
295
296sub xml_text {
297 my $self = shift(@_);
298 my ($expat) = @_;
299
300 if ($self->{'element'} =~ /^(gsdl3:)?Metadata$/) {
301 $self->{'metadata_value'} .= $_;
302 }
303}
304
305sub open_document {
306 my $self = shift(@_);
307
308 # create a new document
309 $self->{'doc_obj'} = new doc ();
310 $self->{'section'} = "";
311}
312
313sub close_document {
314 my $self = shift(@_);
315
316 # add the associated files
317 my $assoc_files =
318 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
319
320 # for when "assocfilepath" isn't the same directory that doc.xml is in...
321 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
322
323 my $assoc_filepath=shift (@$assoc_filepath_list);
324 if (defined ($assoc_filepath)) {
325 # make absolute rather than relative...
326 $self->{'filename'} =~ m@^(.*[\\/]archives)@;
327 $assoc_filepath = "$1/$assoc_filepath/";
328 } else {
329 $assoc_filepath = $self->{'filename'};
330 $assoc_filepath =~ s/[^\\\/]*$//;
331 }
332
333 foreach my $assoc_file_info (@$assoc_files) {
334 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
335 my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file),
336 my $assoc_dir = (defined $dir && $dir ne "")
337 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
338 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
339 }
340 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
341
342 # process the document
343 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
344}
345
346
3471;
348
Note: See TracBrowser for help on using the repository browser.