source: gsdl/trunk/perllib/plugins/METSPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 9.1 KB
Line 
1###########################################################################
2#
3# METSPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes GreenstoneArchive METS documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness). It's assumed that the GreenstoneArchive files conform
29# to their DTD.
30
31
32package METSPlugin;
33
34use ghtml;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use ReadXMLFile;
40use XML::XPath;
41use XML::XPath::XMLParser;
42
# Set up inheritance at compile time: METSPlugin derives from ReadXMLFile.
BEGIN {
    @METSPlugin::ISA = qw(ReadXMLFile);
}
46
# This plugin declares no arguments of its own.
my $arguments = [];

# Option record describing this plugin; new() appends it to the "OptList"
# entry of the hash it is given.
my $options = {
    'name'     => 'METSPlugin',
    'desc'     => '{METSPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
};
53
54
# Returns the default regular expression (as a string) selecting the files
# this plugin processes: names ending in "docmets.xml", case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    my $process_exp = q{(?i)docmets\.xml$};
    return $process_exp;
}
60
# Constructor.
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - passed through untouched to the ReadXMLFile constructor
#   $hashArgOptLists - hash ref; this plugin's $options record is appended to
#                      its "OptList" entry
# Returns the object built by ReadXMLFile, re-blessed into $class, with the
# per-document parsing state initialised to empty values.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # have no args - do we still want this?
    #push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call. The indirect form "new ReadXMLFile(...)"
    # is resolved by parser heuristics, which is fragile inside a package
    # that itself defines a sub called "new".
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'section'} = "";
    $self->{'section_level'} = 0;
    $self->{'metadata_name'} = "";
    $self->{'metadata_value'} = "";
    $self->{'content'} = "";

    return bless $self, $class;
}
80
# Called at the start of each docmets.xml document.
#
# Resets the per-document parsing state, then loads the companion
# doctxt.xml (same path with the file name swapped) into an XML::XPath
# object stored in $self->{'parsed_xml'}, so that the xpointer expressions
# found in mets:FLocat elements can be resolved against it later.
# Any arguments after $self are ignored.
sub xml_start_document {
    my $self = shift (@_);

    $self->{'section'} = "";
    $self->{'section_level'} = 0;
    $self->{'metadata_name'} = "";
    $self->{'metadata_value'} = "";
    $self->{'content'} = "";

    #**defined a dmdSection Table
    $self->{'dmdSec_table'}={};

    #**defined a fileSection Table
    $self->{'fileSec_table'}={};

    #***open doctxt.xml and read the data in
    my $filename = $self->{'filename'};

    # The dot is escaped so it only matches a literal '.', and the match is
    # case-insensitive like the default process expression, so a file that
    # was accepted as e.g. "DocMETS.xml" is still mapped to its text file.
    $filename =~ s/docmets\.xml$/doctxt.xml/i;

    my $xml_text = "";
    # Three-argument open with a lexical handle: the file name can never be
    # mistaken for an open mode, and the handle is closed once read.
    if (open (my $text_fh, '<', $filename)) {
        while (defined (my $line = <$text_fh>)) {
            # the DOCTYPE line is dropped before the text is parsed
            if ($line !~ m/^<!DOCTYPE.*>/) {
                $xml_text .= $line;
            }
        }
        close ($text_fh);
    }
    else {
        print STDERR "Warning: unable to open the $filename\n";
        # 'xmltxt' is not read anywhere in this file; the assignment is
        # kept in case other code relies on it.
        $self->{'xmltxt'} = "";
        # Parse an empty placeholder document instead of leaving the
        # previous document's parser in place: XPath lookups for this
        # document then find nothing rather than returning text that
        # belongs to another document.
        $xml_text = "<METSPluginNoText/>";
    }
    $self->{'parsed_xml'} = XML::XPath->new (xml => $xml_text);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "METSPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$self->{'file'}' p='METSPlugin'>\n" if ($self->{'gli'});

}
124
# End-of-document handler. Does nothing; documents are closed from
# xml_end_tag when the outermost mets:div ends.
sub xml_end_document {
    return;
}
127
# Doctype handler. Does nothing.
sub xml_doctype {
    return;
}
130
# Start-tag handler: remembers the current element name and dispatches to
# the handler for the METS section the element belongs to.
#   $expat   - passed through to the section handlers
#   $element - name of the element being opened
# The element's attributes are read from the global hash %_.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'element'} = $element;

    #**deal with dmdSection
    if ($element eq "mets:dmdSec" || $element eq "gsdl3:Metadata"){
        $self->xml_dmd_start_tag (@_);
    } elsif ($element eq "mets:file") {
        # Capture through a list assignment so that an ID which does not
        # contain "FILE" yields undef. Reading $1 after a failed match
        # would pick up the capture of some earlier, unrelated match, and
        # xml_fileloc_start_tag relies on an undefined id to skip the file.
        my $id_attr = defined $_{'ID'} ? $_{'ID'} : "";
        my ($file_id) = ($id_attr =~ m/FILE(.*)/);
        $self->{'file_Id'} = $file_id;
    } elsif ($element eq "mets:FLocat"){
        #***deal with fileSection
        $self->xml_fileloc_start_tag (@_);
    } elsif ($element eq "mets:div"){
        #***deal with StrucMap Section
        $self->xml_strucMap_start_tag (@_);
    }
}
151
# Start-tag handling for the descriptive metadata section.
#   gsdl3:Metadata - remembers the metadata name from the "name" attribute
#   mets:dmdSec    - starts an empty metadata list for the section whose
#                    number follows "DM" in the ID attribute, and records
#                    that number as the current one in the table
# Attributes are read from the global hash %_.
sub xml_dmd_start_tag {
    my ($self, $expat, $element) = @_;

    if ($element eq "gsdl3:Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
    elsif ($element eq "mets:dmdSec") {
        my ($dmd_number) = ($_{'ID'} =~ m/DM(.*)/);

        # new list of name/value pairs for this section ...
        $self->{'dmdSec_table'}->{"$dmd_number"} = [];
        # ... and make it the section that xml_end_tag appends to
        $self->{'dmdSec_table'}->{'section_num'} = $dmd_number;
    }
}
164
# Start-tag handling for mets:FLocat.
#
# The "xlink:href" attribute (read from the global hash %_) is expected to
# look like "file:<name>#xpointer(<xpath>)". The XPath part is evaluated
# against the text document held in $self->{'parsed_xml'}, and every node
# found is stored, HTML-unescaped, in the file section table under the id
# of the enclosing mets:file ($self->{'file_Id'}).
sub xml_fileloc_start_tag {
    my $self = shift (@_);
    my ($expat, $element) = @_;

    my $xlink = $_{'xlink:href'};
    #my ($section_num) = ($_{'ID'} =~ m/^FLOCAT(.*)$/);
    my $section_num = $self->{'file_Id'};

    return if (!defined $section_num);
    #**return if the section_num is not defined or not deal with the whole section (ID="default.*")

    $self->{'fileSec_table'}->{"$section_num"}=[];
    $self->{'fileSec_table'}->{'section_num'}=$section_num;

    # Only the xpointer expression is needed; the file name part of the
    # link is not used.
    my $xpath_expr;
    if (defined $xlink) {
        ($xpath_expr) = ($xlink =~ m/^file:.*\#xpointer\((.*)\)$/);
    }

    # Without an expression or a parsed text document there is nothing to
    # look up; warn and leave the section without text instead of dying
    # inside findnodes.
    if (!defined $xpath_expr || $xpath_expr eq "") {
        my $shown_link = defined $xlink ? $xlink : "";
        print STDERR "Warning: no xpointer expression in xlink:href '$shown_link'\n";
        return;
    }
    my $parsed_xml = $self->{'parsed_xml'};
    if (!defined $parsed_xml) {
        print STDERR "Warning: no parsed text document available for XPATH $xpath_expr\n";
        return;
    }

    my $nodeset = $parsed_xml->findnodes ($xpath_expr);
    my $node_size= $nodeset->size;

    if ($node_size==0) {
        print STDERR "Warning: no text associated with XPATH $xpath_expr\n";
    }
    else {
        my $content_list = $self->{'fileSec_table'}->{"$section_num"};
        foreach my $node ($nodeset->get_nodelist) {
            my $xml_content = XML::XPath::XMLParser::as_string($node);
            my $unescaped_xml_content = &ghtml::unescape_html($xml_content);

            push (@$content_list, {'section_content'=> $unescaped_xml_content});
        }
    }
}
199
# Start-tag handling for mets:div in the structure map.
#
# Every div except the one with ID "DSAll" becomes a section: the first one
# opens a new document, later ones are inserted after the last child of the
# current section. The metadata and text collected earlier for the section
# number following "DS" in the ID are then attached to that section.
# The ID attribute is read from the global hash %_.
sub xml_strucMap_start_tag {
    my ($self, $expat, $element) = @_;

    my $div_id = $_{'ID'};
    my ($struct_number) = ($div_id =~ m/DS(.*)/);

    # the "DSAll" div is not turned into a section
    return if ($div_id eq "DSAll");

    if ($self->{'section_level'} != 0) {
        my $doc = $self->{'doc_obj'};
        $self->{'section'} =
            $doc->insert_section($doc->get_end_child($self->{'section'}));
    }
    else {
        $self->open_document();
    }
    $self->{'section_level'} += 1;

    #***Add metadata from dmdSection
    my $pairs = $self->{'dmdSec_table'}->{"$struct_number"};
    for my $pair (@$pairs) {
        my ($name, $value) = @{$pair}{'metadata_name', 'metadata_value'};
        $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $name, $value);
    }

    #*** Add content from fileSection
    my $chunks = $self->{'fileSec_table'}->{"$struct_number"};
    for my $chunk (@$chunks) {
        $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $chunk->{'section_content'});
    }
}
235
# Returns the document type name this plugin reports: "mets:mets".
sub get_doctype {
    my ($self) = @_;

    my $doctype = "mets:mets";
    return $doctype;
}
241
# End-tag handler.
#   gsdl3:Metadata - stores the collected name/value pair in the list of the
#                    current dmdSec and clears the collection buffers
#   mets:file      - forgets the current file id
#   mets:div       - closes the current section; closing the outermost
#                    section closes (and processes) the document
#   $expat   - unused
#   $element - name of the element being closed
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    if ($element eq "gsdl3:Metadata") {
        my $section_num = $self->{'dmdSec_table'}->{'section_num'};
        my $metadata_name=$self->{'metadata_name'};
        my $metadata_value=$self->{'metadata_value'};

        my $md_pair={'metadata_name' => $metadata_name,
                     'metadata_value'=> $metadata_value};

        my $md_list = $self->{'dmdSec_table'}->{"$section_num"};

        push(@$md_list,$md_pair);

        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    } elsif ($element eq "mets:file"){
        # The key is 'file_Id' everywhere else in this file; resetting
        # 'file_id' left the real value in place. It is reset to undef
        # rather than "" because xml_fileloc_start_tag uses an undefined
        # id as its signal to skip a location.
        $self->{'file_Id'} = undef;
    }


    #*** StrucMap Section
    # Only unwind sections that were actually opened. The start handler
    # skips the div with ID "DSAll", so its end tag arrives with no
    # section open and must not drive the level negative or touch doc_obj.
    if ($element eq "mets:div" && $self->{'section_level'} > 0) {
        $self->{'section_level'}--;
        $self->{'section'} = $self->{'doc_obj'}->get_parent_section($self->{'section'});
        $self->close_document() if $self->{'section_level'}==0;
    }
    $self->{'element'} = "";
}
273
# Character-data handler. While inside a gsdl3:Metadata element the text
# (delivered in $_) is appended to the metadata value being collected;
# text anywhere else is ignored.
sub xml_text {
    my ($self, $expat) = @_;

    return unless ($self->{'element'} eq "gsdl3:Metadata");

    $self->{'metadata_value'} = $self->{'metadata_value'} . $_;
}
282
# Starts a new document: creates a fresh doc object in $self->{'doc_obj'}
# and resets the current section to "".
sub open_document {
    my $self = shift(@_);

    # create a new document
    # (explicit class-method call; the indirect form "new doc ()" depends
    # on parser heuristics to be read as a method call)
    $self->{'doc_obj'} = doc->new();
    $self->{'section'} = "";
}
290
# Finishes the current document: turns its "gsdlassocfile" metadata into
# associated files on the doc object, removes that metadata, and hands the
# document to the processor.
#
# Each "gsdlassocfile" value is split on ":" into file name, mime type and
# an optional directory. The files are looked up under the document's
# "assocfilepath" (relative to the "archives" directory found in the METS
# file's path) or, failing that, next to the METS file itself.
sub close_document {
    my $self = shift(@_);

    # add the associated files
    my $assoc_files =
        $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");

    # for when "assocfilepath" isn't the same directory that doc.xml is in...
    my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");

    my $assoc_filepath=shift (@$assoc_filepath_list);
    my $archives_dir;
    if (defined ($assoc_filepath)) {
        # Captured through a list assignment: if the path has no
        # "archives" component this is undef, instead of whatever $1
        # held from an earlier match.
        ($archives_dir) = ($self->{'filename'} =~ m@^(.*[\\/]archives)@);
        if (!defined $archives_dir) {
            print STDERR "Warning: no archives directory in $self->{'filename'}, looking for associated files next to it\n";
        }
    }

    if (defined $archives_dir) {
        # make absolute rather than relative...
        $assoc_filepath = "$archives_dir/$assoc_filepath/";
    } else {
        # directory part of the METS file's own path
        $assoc_filepath = $self->{'filename'};
        $assoc_filepath =~ s/[^\\\/]*$//;
    }

    foreach my $assoc_file_info (@$assoc_files) {
        my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
        my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file);
        my $assoc_dir = (defined $dir && $dir ne "")
            ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
        $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
    }
    $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");

    # process the document
    $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
}
323
324
3251;
326
Note: See TracBrowser for help on using the repository browser.