source: trunk/gsdl/perllib/plugins/METSPlug.pm@ 11069

Last change on this file since 11069 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 9.1 KB
RevLine 
[7901]1###########################################################################
2#
3# METSPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes GreenstoneArchive METS documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness). It's assumed that the GreenstoneArchive files conform
29# to their DTD.
30
31# 28/07/04 Modified to METSPlug - CHI-YU HUANG
32
33package METSPlug;
34
35use ghtml;
36
[10254]37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
[7901]40use XMLPlug;
41use XML::XPath;
42use XML::XPath::XMLParser;
43
# Set up inheritance at compile time: METSPlug derives from XMLPlug.
BEGIN {
    @METSPlug::ISA = qw(XMLPlug);
}
47
# METSPlug declares no arguments of its own.
my $arguments = [];

# Option record for this plugin; new() pushes it onto the "OptList" array.
my $options = {
    'name'     => "METSPlug",
    'desc'     => "{METSPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
};
54
55
# Returns the default process expression as a string: a case-insensitive
# pattern matching names that end in "docmets.xml".
sub get_default_process_exp {
    my ($self) = @_;

    my $process_exp = q^(?i)docmets\.xml$^;
    return $process_exp;
}
61
# Constructor.
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to the XMLPlug constructor
#   $hashArgOptLists - hash ref with "ArgList" / "OptList" arrays that this
#                      plugin's $arguments and $options are appended to;
#                      created here when the caller did not supply one
#
# Returns the object built by XMLPlug->new, re-blessed into $class, with the
# per-document parsing state initialised to empty values.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The pushes below would autovivify this hash anyway, which made the old
    # "defined $hashArgOptLists ? ... : ..." test always true.  Create it
    # explicitly so a single constructor call is enough.
    $hashArgOptLists = {} unless defined $hashArgOptLists;

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Arrow syntax instead of "new XMLPlug(...)": indirect-object syntax is
    # ambiguous inside a package that defines its own new().
    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'section'}        = "";
    $self->{'section_level'}  = 0;
    $self->{'metadata_name'}  = "";
    $self->{'metadata_value'} = "";
    $self->{'content'}        = "";

    return bless $self, $class;
}
80
# Called at the start of each docmets.xml document.
#
# Resets the per-document parsing state, empties the dmdSec and fileSec
# lookup tables, and loads the companion doctxt.xml (same path with the file
# name swapped) into an XML::XPath object stored in $self->{'parsed_xml'}.
# Lines that start with a one-line <!DOCTYPE ...> declaration are dropped
# before parsing.
#
#   $expat, $element - accepted but not used here
sub xml_start_document {
    my $self = shift (@_);
    my ($expat, $element) = @_;

    $self->{'section'}        = "";
    $self->{'section_level'}  = 0;
    $self->{'metadata_name'}  = "";
    $self->{'metadata_value'} = "";
    $self->{'content'}        = "";

    #**defined a dmdSection Table
    $self->{'dmdSec_table'} = {};

    #**defined a fileSection Table
    $self->{'fileSec_table'} = {};

    #***open doctxt.xml and read the data in
    my $filename = $self->{'filename'};

    # Escape the dot and ignore case: get_default_process_exp accepts the
    # name case-insensitively, so the swap has to as well.
    $filename =~ s/docmets\.xml$/doctxt.xml/i;

    # Three-argument open on a lexical handle; closed as soon as it is read.
    if (open (my $filein, '<', $filename)) {
        my $xml_text = "";
        while (defined (my $line = <$filein>)) {
            if ($line !~ m/^<!DOCTYPE.*>/) {
                $xml_text .= $line;
            }
        }
        close ($filein);

        $self->{'parsed_xml'} = XML::XPath->new (xml => $xml_text);
    }
    else {
        print STDERR "Warning: unable to open the $filename\n";
        $self->{'xmltxt'} = "";

        # Do not keep the previous document's parser around: fall back to an
        # empty placeholder so later lookups find no text instead of
        # silently reusing another document's content.
        $self->{'parsed_xml'} = XML::XPath->new (xml => "<NoText/>");
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "METSPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$self->{'file'}' p='METSPlug'>\n" if ($self->{'gli'});
}
124
# End-of-document callback; intentionally does nothing.
sub xml_end_document {
    return;
}
127
# DOCTYPE callback; intentionally does nothing.
sub xml_doctype {
    return;
}
130
# Start-tag callback: records the current element name and dispatches on it.
#
#   mets:dmdSec / gsdl3:Metadata -> xml_dmd_start_tag
#   mets:file                    -> remembers the part of the ID attribute
#                                   after "FILE" in $self->{'file_Id'}
#   mets:FLocat                  -> xml_fileloc_start_tag
#   mets:div                     -> xml_strucMap_start_tag
#
# Attributes are read from the global %_ hash (presumably filled in by the
# XML parser before this callback runs -- confirm in XMLPlug).
#
#   $expat   - passed through to the specialised handlers
#   $element - name of the element being opened
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'element'} = $element;

    #**deal with dmdSection
    if ($element eq "mets:dmdSec" || $element eq "gsdl3:Metadata") {
        $self->xml_dmd_start_tag (@_);
    }
    elsif ($element eq "mets:file") {
        # Capture in list context so that a missing or non-matching ID
        # yields undef instead of whatever $1 held from an earlier match.
        my $id = $_{'ID'};
        my ($file_id) = defined $id ? ($id =~ m/FILE(.*)/) : ();
        $self->{'file_Id'} = $file_id;
    }
    elsif ($element eq "mets:FLocat") {
        #***deal with fileSection
        $self->xml_fileloc_start_tag (@_);
    }
    elsif ($element eq "mets:div") {
        #***deal with StrucMap Section
        $self->xml_strucMap_start_tag (@_);
    }
}
151
# Handles the opening of the two descriptive-metadata elements.
#
#   gsdl3:Metadata - remembers the "name" attribute as the pending
#                    metadata name
#   mets:dmdSec    - takes the part of the ID attribute after "DM" as the
#                    section number, starts an empty list for it in
#                    dmdSec_table and records it as the current section
#
# Attributes are read from the global %_ hash.
sub xml_dmd_start_tag {
    my ($self, $expat, $element) = @_;

    if ($element eq "gsdl3:Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
    elsif ($element eq "mets:dmdSec") {
        my ($dmd_num) = ($_{'ID'} =~ m/DM(.*)/);
        $self->{'dmdSec_table'}->{"$dmd_num"}     = [];
        $self->{'dmdSec_table'}->{'section_num'} = $dmd_num;
    }
}
164
# Handles <mets:FLocat>: resolves the xpointer in the xlink:href attribute
# against the parsed doctxt.xml and stores the matching text under the
# current file id in fileSec_table.
#
# Does nothing when no file id is set.  Otherwise an empty
# content list is always registered for the id; if the href cannot be parsed
# or no parsed text is available, a warning is printed and the list stays
# empty.
#
#   $expat, $element - accepted but not used here
sub xml_fileloc_start_tag {
    my $self = shift (@_);
    my ($expat, $element) = @_;

    my $xlink = $_{'xlink:href'};
    my $section_num = $self->{'file_Id'};

    #**return if the section_num is not defined or not deal with the whole section (ID="default.*")
    return if (!defined $section_num);

    $self->{'fileSec_table'}->{"$section_num"} = [];
    $self->{'fileSec_table'}->{'section_num'}  = $section_num;

    # Expected form: file:<name>#xpointer(<xpath>).  Only the XPath is used.
    my $xpath_expr;
    if (defined $xlink) {
        ($xpath_expr) = ($xlink =~ m/^file:.*\#xpointer\((.*)\)$/);
    }
    if (!defined $xpath_expr) {
        my $shown = defined $xlink ? $xlink : "(missing)";
        print STDERR "Warning: cannot find an xpointer in xlink:href $shown\n";
        return;
    }

    # Guard against calling findnodes on an undefined value.
    my $parsed_xml = $self->{'parsed_xml'};
    if (!defined $parsed_xml) {
        print STDERR "Warning: no parsed text available for XPATH $xpath_expr\n";
        return;
    }

    my $nodeset = $parsed_xml->findnodes ($xpath_expr);

    if ($nodeset->size == 0) {
        print STDERR "Warning: no text associated with XPATH $xpath_expr\n";
        return;
    }

    my $content_list = $self->{'fileSec_table'}->{"$section_num"};
    foreach my $node ($nodeset->get_nodelist) {
        my $xml_content = XML::XPath::XMLParser::as_string($node);
        my $unescaped_xml_content = ghtml::unescape_html($xml_content);

        push (@$content_list, {'section_content' => $unescaped_xml_content});
    }
}
199
# Handles <mets:div> in the structure map.
#
# The part of the ID attribute after "DS" is the section number.  A div
# whose ID is "DSAll" is ignored.  For any other div: at nesting level 0 a
# new document is opened, otherwise a new section is inserted after the last
# child of the current section; the nesting level is then increased and the
# metadata and text collected for that section number in dmdSec_table and
# fileSec_table are added to the document.
#
# Attributes are read from the global %_ hash.
sub xml_strucMap_start_tag {
    my ($self, $expat, $element) = @_;

    my ($section_num) = ($_{'ID'} =~ m/DS(.*)/);

    if ($_{'ID'} ne "DSAll") {
        if ($self->{'section_level'} != 0) {
            my $doc = $self->{'doc_obj'};
            $self->{'section'} =
                $doc->insert_section($doc->get_end_child($self->{'section'}));
        }
        else {
            $self->open_document();
        }
        $self->{'section_level'}++;

        #***Add metadata from dmdSection
        my $pairs = $self->{'dmdSec_table'}->{"$section_num"};
        for my $pair (@$pairs) {
            $self->{'doc_obj'}->add_utf8_metadata(
                $self->{'section'},
                $pair->{'metadata_name'},
                $pair->{'metadata_value'});
        }

        #*** Add content from fileSection
        my $chunks = $self->{'fileSec_table'}->{"$section_num"};
        for my $chunk (@$chunks) {
            $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $chunk->{'section_content'});
        }
    }
}
235
# End-tag callback.
#
#   gsdl3:Metadata - appends the collected name/value pair to the list of
#                    the current dmdSec section and clears the pending
#                    name and value
#   mets:file      - forgets the current file id
#   mets:div       - leaves the current section: decrements the nesting
#                    level, moves to the parent section and closes the
#                    document when the level reaches 0
#
# Always clears $self->{'element'} on the way out.
#
#   $expat   - accepted but not used here
#   $element - name of the element being closed
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    if ($element eq "gsdl3:Metadata") {
        my $section_num = $self->{'dmdSec_table'}->{'section_num'};

        my $md_pair = {'metadata_name'  => $self->{'metadata_name'},
                       'metadata_value' => $self->{'metadata_value'}};

        my $md_list = $self->{'dmdSec_table'}->{"$section_num"};
        push(@$md_list, $md_pair);

        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
    elsif ($element eq "mets:file") {
        # The key is 'file_Id' (capital I) everywhere it is set or read; the
        # old reset wrote to 'file_id' and so never cleared anything.  Use
        # undef so xml_fileloc_start_tag's "not defined" guard applies.
        $self->{'file_Id'} = undef;
    }

    #*** StrucMap Section
    # xml_strucMap_start_tag does not open a section for the "DSAll" div, so
    # only unwind when a section is actually open.  Otherwise the level
    # would go negative and doc_obj might be used before it exists.
    if ($element eq "mets:div" && $self->{'section_level'} > 0) {
        $self->{'section_level'}--;
        $self->{'section'} = $self->{'doc_obj'}->get_parent_section($self->{'section'});
        $self->close_document() if $self->{'section_level'} == 0;
    }
    $self->{'element'} = "";
}
267
# Character-data callback: while the current element is gsdl3:Metadata, the
# text held in $_ is appended to the pending metadata value.  Text inside
# any other element is ignored.
sub xml_text {
    my ($self, $expat) = @_;

    $self->{'metadata_value'} .= $_ if $self->{'element'} eq "gsdl3:Metadata";
}
276
# Starts a new document: stores a fresh doc object in $self->{'doc_obj'}
# and makes the empty string the current section.
sub open_document {
    my $self = shift(@_);

    # create a new document
    # Arrow syntax instead of "new doc ()": this package defines its own
    # new(), so the indirect-object form relies on Perl guessing that "doc"
    # is a class name rather than a function.
    $self->{'doc_obj'} = doc->new();
    $self->{'section'} = "";
}
284
# Finishes the current document: attaches its associated files, removes the
# "gsdlassocfile" bookkeeping metadata and hands the document to the
# processor.
#
# Each "gsdlassocfile" value has the form "file:mime_type:dir" (dir may be
# absent).  The files are looked up under <...archives>/<assocfilepath>/
# when the top section has "assocfilepath" metadata, otherwise in the
# directory of the file being processed.
sub close_document {
    my $self = shift(@_);

    my $doc_obj     = $self->{'doc_obj'};
    my $top_section = $doc_obj->get_top_section();

    # add the associated files
    my $assoc_files = $doc_obj->get_metadata($top_section, "gsdlassocfile");

    # for when "assocfilepath" isn't the same directory that doc.xml is in...
    my $assoc_filepath_list = $doc_obj->get_metadata($top_section, "assocfilepath");

    # Directory (with trailing separator) of the file being processed.
    my $file_dir = $self->{'filename'};
    $file_dir =~ s/[^\\\/]*$//;

    my $assoc_filepath = shift (@$assoc_filepath_list);
    if (defined ($assoc_filepath)) {
        # make absolute rather than relative...
        # Only use the capture when the match succeeded; otherwise $1 would
        # be undefined or left over from some earlier, unrelated match.
        if ($self->{'filename'} =~ m@^(.*[\\/]archives)@) {
            $assoc_filepath = "$1/$assoc_filepath/";
        }
        else {
            print STDERR "Warning: $self->{'filename'} is not inside an archives directory; "
                . "looking for associated files next to it\n";
            $assoc_filepath = $file_dir;
        }
    }
    else {
        $assoc_filepath = $file_dir;
    }

    foreach my $assoc_file_info (@$assoc_files) {
        my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
        # This statement used to end in a comma, chaining it to the next
        # declaration by accident.
        my $real_dir = util::filename_cat($assoc_filepath, $assoc_file);
        my $assoc_dir = (defined $dir && $dir ne "")
            ? util::filename_cat($dir, $assoc_file) : $assoc_file;
        $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
    }
    $doc_obj->delete_metadata($top_section, "gsdlassocfile");

    # process the document
    $self->{'processor'}->process($doc_obj, $self->{'file'});
}
317
318
3191;
320
Note: See TracBrowser for help on using the repository browser.