source: trunk/gsdl/perllib/plugins/METSPlug.pm@ 7911

Last change on this file since 7911 was 7901, checked in by chi, 20 years ago

Greenstone2 now supports METS format as an archiving option.
'import.pl' can be run with '-saveas METS' and METS format files
will be generated in the archive directory instead of GA format.
To read in these METS files using 'buildcol.pl' specify METSPlug
in your configuration file.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.8 KB
Line 
1###########################################################################
2#
3# METSPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes GreenstoneArchive METS documents. Note that this plugin does no
27# syntax checking (though the XML::Parser module tests for
28# well-formedness). It's assumed that the GreenstoneArchive files conform
29# to their DTD.
30
31# 28/07/04 Modified to METSPlug - CHI-YU HUANG
32
33package METSPlug;
34
35use ghtml;
36
37use XMLPlug;
38use XML::XPath;
39use XML::XPath::XMLParser;
40
41sub BEGIN {
42 @ISA = ('XMLPlug');
43}
44
45my $options = { 'name' => "METSPlug",
46 'desc' => "{METSPlug.desc}",
47 'abstract' => "no",
48 'inherits' => "yes" };
49
50
51sub get_default_process_exp {
52 my $self = shift (@_);
53
54 return q^(?i)docmets\.xml$^;
55}
56
57sub new {
58 my $class = shift (@_);
59 my $self = new XMLPlug ($class, @_);
60
61 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
62 my $option_list = $self->{'option_list'};
63 push( @{$option_list}, $options );
64
65 $self->{'section'} = "";
66 $self->{'section_level'} = 0;
67 $self->{'metadata_name'} = "";
68 $self->{'metadata_value'} = "";
69 $self->{'content'} = "";
70
71 return bless $self, $class;
72}
73
74sub xml_start_document {
75 my $self = shift (@_);
76 my ($expat, $element) = @_;
77
78 $self->{'section'} = "";
79 $self->{'section_level'} = 0;
80 $self->{'metadata_name'} = "";
81 $self->{'metadata_value'} = "";
82 $self->{'content'}="";
83
84 #**defined a dmdSection Table
85 $self->{'dmdSec_table'}={};
86
87 #**defined a fileSection Table
88 $self->{'fileSec_table'}={};
89
90 #***open doctxt.xml and read the data in
91 my $filename = $self->{'filename'};
92
93 $filename =~ s/docmets.xml$/doctxt.xml/;
94
95 if (!open (FILEIN,"<$filename")){
96 print STDERR "Warning: unable to open the $filename\n";
97 $self->{'xmltxt'} = "";
98 }
99 else {
100 my $xml_text = "";
101 while (defined (my $line = <FILEIN>)) {
102 if ($line !~ m/^<!DOCTYPE.*>/) {
103 $xml_text .= $line;
104 }
105 }
106 my $xml_parser = XML::XPath->new (xml=> $xml_text);
107 #my $xml_tree = $xml_parser->parse ($xml_text);
108
109 #eval {$self->{'parser_text'}->parse};
110 $self->{'parsed_xml'} = $xml_parser;
111 }
112}
113
114sub xml_end_document {
115}
116
117sub xml_doctype {
118 my $self = shift(@_);
119
120 my ($expat, $name, $sysid, $pubid, $internal) = @_;
121
122 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
123 # as well as the "Archive" files which should now be created by import.pl
124 die "" if ($name !~ /^(Greenstone)?Archive$/);
125
126 my $outhandle = $self->{'outhandle'};
127 print $outhandle "METSPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
128}
129
130sub xml_start_tag {
131 my $self = shift(@_);
132 my ($expat, $element) = @_;
133
134 $self->{'element'} = $element;
135
136 #**deal with dmdSection
137 if ($element eq "mets:dmdSec" || $element eq "gsdl:Metadata"){
138 $self->xml_dmd_start_tag (@_);
139 } elsif ($element eq "mets:FLocate"){
140 #***deal with fileSection
141 $self->xml_file_start_tag (@_);
142 } elsif ($element eq "mets:div"){
143 #***deal with StrucMap Section
144 $self->xml_strucMap_start_tag (@_);
145 }
146}
147
148sub xml_dmd_start_tag {
149 my $self = shift (@_);
150 my ($expat, $element) = @_;
151
152 if ($element eq "mets:dmdSec"){
153 my ($section_num) = ($_{'ID'} =~ m/DM(.*)/);
154 $self->{'dmdSec_table'}->{"$section_num"}=[];
155 $self->{'dmdSec_table'}->{'section_num'}=$section_num;
156
157 } elsif ($element eq "gsdl:Metadata") {
158 $self->{'metadata_name'} = $_{'name'};
159 }
160}
161
162sub xml_file_start_tag {
163 my $self = shift (@_);
164 my ($expat, $element) = @_;
165
166 my $xlink = $_{'xlink:href'};
167 my ($section_num) = ($_{'ID'} =~ m/^FILE(.*)$/);
168
169 return if (!defined $section_num);
170 #**return if the section_num is not defined or not deal with the whole section (ID="default.*")
171
172 $self->{'fileSec_table'}->{"$section_num"}=[];
173 $self->{'fileSec_table'}->{'section_num'}=$section_num;
174
175 my ($filename,$xpath_expr)=($xlink =~ m/^file:(.*)\#xpointer\((.*)\)$/);
176
177 my $nodeset = $self->{'parsed_xml'}->findnodes ($xpath_expr);
178 my $node_size= $nodeset->size;
179
180 if ($node_size==0) {
181 print STDERR "Warning: no text associated with XPATH $xpath_expr\n";
182 }
183 else {
184 foreach my $node ($nodeset->get_nodelist) {
185 my $xml_content = XML::XPath::XMLParser::as_string($node);
186 my $unescaped_xml_content = &ghtml::unescape_html($xml_content);
187
188 my $section_content={'section_content'=> $unescaped_xml_content};
189
190 my $content_list = $self->{'fileSec_table'}->{"$section_num"};
191 push (@$content_list, $section_content);
192 #print STDERR "###Adding the content=$xml_content\n";
193 }
194 }
195}
196
197sub xml_strucMap_start_tag {
198 my $self = shift (@_);
199 my ($expat, $element) = @_;
200
201 if ($_{'ID'} ne "All") {
202 my ($section_num) = ($_{'ID'} =~ m/DS(.*)/);
203
204 if ($self->{'section_level'}==0) {
205 $self->open_document();
206 } else {
207 my $doc_obj = $self->{'doc_obj'};
208 $self->{'section'}=
209 $doc_obj->insert_section($doc_obj->get_end_child($self->{'section'}));
210 }
211 $self->{'section_level'}++;
212
213 #***Add metadata from dmdSection
214 my $md_list = $self->{'dmdSec_table'}->{"$section_num"};
215
216 foreach my $md_pair (@$md_list){
217 my $metadata_name = $md_pair->{'metadata_name'};
218 my $metadata_value = $md_pair->{'metadata_value'};
219 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $metadata_name, $metadata_value);
220 }
221
222 #*** Add content from fileSection
223 my $content_list = $self->{'fileSec_table'}->{"$section_num"};
224
225 foreach my $section_content (@$content_list){
226 my $content = $section_content->{'section_content'};
227 $self->{'doc_obj'}->add_utf8_text($self->{'section'},$content);
228 }
229 }
230}
231
232sub xml_end_tag {
233 my $self = shift(@_);
234 my ($expat, $element) = @_;
235
236 if ($element eq "gsdl:Metadata") {
237 my $section_num = $self->{'dmdSec_table'}->{'section_num'};
238 my $metadata_name=$self->{'metadata_name'};
239 my $metadata_value=$self->{'metadata_value'};
240
241 my $md_pair={'metadata_name' => $metadata_name,
242 'metadata_value'=> $metadata_value};
243
244 my $md_list = $self->{'dmdSec_table'}->{"$section_num"};
245
246 push(@$md_list,$md_pair);
247
248 $self->{'metadata_name'} = "";
249 $self->{'metadata_value'} = "";
250 }
251
252 #*** StrucMap Section
253 if ($element eq "mets:div") {
254 $self->{'section_level'}--;
255 $self->{'section'} = $self->{'doc_obj'}->get_parent_section($self->{'section'});
256 $self->close_document() if $self->{'section_level'}==0;
257 }
258 $self->{'element'} = "";
259}
260
261sub xml_text {
262 my $self = shift(@_);
263 my ($expat) = @_;
264
265 if ($self->{'element'} eq "gsdl:Metadata") {
266 $self->{'metadata_value'} .= $_;
267 }
268}
269
270sub open_document {
271 my $self = shift(@_);
272
273 # create a new document
274 $self->{'doc_obj'} = new doc ();
275 $self->{'section'} = "";
276}
277
278sub close_document {
279 my $self = shift(@_);
280
281 # add the associated files
282 my $assoc_files =
283 $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
284
285 # for when "assocfilepath" isn't the same directory that doc.xml is in...
286 my $assoc_filepath_list= $self->{'doc_obj'}->get_metadata($self->{'doc_obj'}->get_top_section(), "assocfilepath");
287
288 my $assoc_filepath=shift (@$assoc_filepath_list);
289 if (defined ($assoc_filepath)) {
290 # make absolute rather than relative...
291 $self->{'filename'} =~ m@^(.*[\\/]archives)@;
292 $assoc_filepath = "$1/$assoc_filepath/";
293 } else {
294 $assoc_filepath = $self->{'filename'};
295 $assoc_filepath =~ s/[^\\\/]*$//;
296 }
297
298 foreach my $assoc_file_info (@$assoc_files) {
299 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
300 my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file),
301 my $assoc_dir = (defined $dir && $dir ne "")
302 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
303 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
304 }
305 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
306
307 # process the document
308 $self->{'processor'}->process($self->{'doc_obj'}, $self->{'file'});
309}
310
311
3121;
313
Note: See TracBrowser for help on using the repository browser.