source: trunk/gsdl/perllib/plugins/BNContentePlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1###########################################################################
2#
3# BNContentePlug.pm -- plugin for import the BN-Portugal Collection
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# BNContentePlug - 11/2004
27#
28#
29# This plugin takes "mets.xml" and "record/NCB_***.xml: the file contain MARC details
30# about BN-Portugal ccllection. The intension is to import such a collection into GS2.
31
32package BNContentePlug;
33
34use BasPlug;
35use plugin;
36#use ghtml;
37use XMLParser;
38use XML::Parser;
39
40sub BEGIN {
41 @BNContentePlug::ISA = ('BasPlug');
42 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
43}
44
45my $arguments =
46 [ { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "string",
49 'deft' => &get_default_process_exp(),
50 'reqd' => "no" },
51 { 'name' => "only_first_doc",
52 'desc' => "{BNContentePlug.only_first_doc}",
53 'type' => "flag",
54 'reqd' => "no" },
55 { 'name' => "first_inoder_ext",
56 'desc' => "{BNContentePlug.first_inorder_ext}",
57 'type' => "flag",
58 'reqd' => "no" },
59 { 'name' => "first_inorder_mime",
60 'desc' => "{BNContentePlug.first_inorder_mime}",
61 'type' => "flag",
62 'reqd' => "no" },
63 { 'name' => "block_exp",
64 'desc' => "{BasPlug.block_exp}",
65 'type' => "string",
66 'deft' => &get_default_block_exp(),
67 'reqd' => "no" }];
68
69my $options = { 'name' => "BNContentePlug",
70 'desc' => "{BNContentePlug.desc}",
71 'inherits' => "yes",
72 'args' => $arguments };
73
74# Important variation to regular plugin structure. Need to desclare
75# $self as global variable to file so XMLParser callback routines
76# can access the content of the object.
77my ($self);
78
79sub get_default_process_exp {
80 my $self = shift (@_);
81
82 return q^(?i)(metsHTML\.xml)$^;
83}
84
85# block files
86sub get_default_block_exp {
87 my $self = shift (@_);
88
89 # Block all files besides contents
90 #return q^(?i)(metsHTML\.xml|)$^;
91 return q^(?i)((.*?)\.(.*?))$^;
92}
93
94sub new {
95 my ($class) = shift (@_);
96 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
97 push(@$pluginlist, $class);
98
99 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
100 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
101
102 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
103
104 #create XML::Parser object for parsing metsHTML.xml, NCB_???.xml files
105 my $mets_parser = new XML::Parser('Style' => 'Stream',
106 'Handlers' => {'Doctype' => \&METS_Doctype,
107 'Start' => \&METS_StartTag,
108 'End' => \&METS_EndTag
109 });
110
111 my $marc_parser = new XML::Parser('Style' => 'Stream',
112 'Handlers' => {'Char' => \&Char,
113 'Doctype' => \&MARC_Doctype,
114 'Start' => \&MARC_StartTag,
115 'End' => \&MARC_EndTag
116 });
117 $self->{'mets_parser'} = $mets_parser;
118 $self->{'marc_parser'} = $marc_parser;
119
120 $self->{'index_file'} = "";
121
122 return bless $self, $class;
123}
124
125sub read_marc_content {
126 my $self = shift (@_);
127 my ($marc_file) = @_;
128
129 # parse the Marc_file: NCB_???.xml
130 eval{
131 $self->{'marc_parser'}->parsefile($marc_file);
132 };
133
134 if ($@) {
135 die "BNContentePlug: ERROR $marc_file is not a well formed XML file ($@)\n";
136 }
137}
138
139# Read metsHTML.xml from BN-Portugal collection
140sub metadata_read {
141 my $self = shift (@_);
142 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
143
144 my $outhandle = $self->{'outhandle'};
145
146 my $filename = &util::filename_cat($base_dir, $file);
147
148 if ($filename !~ /metsHTML\.xml$/ || !-f $filename) {
149 if ($filename =~ /\.xml$/i || $filename =~ /log\.txt$/i || $filename =~ /isbd\.html$/i) {
150 $self->{'file_blocks'}->{$filename}=1;
151 }
152 return undef;
153 } else {
154 $self->{'file_blocks'}->{$filename}=1;
155 }
156
157 print $outhandle "BNContentePlug: extracting metadata from $filename\n"
158 if $self->{'verbosity'} > 1;
159
160 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
161 $self->{'dir'} = $dir;
162
163 eval {
164 $self->{'mets_parser'}->parsefile($filename);
165 };
166
167 if ($@) {
168 die "BNContentePlug: ERROR $filename is not a well formed XML file ($@)\n";
169 }
170
171 # read NCB_???.xml to parse MARC records and save as metadata
172 my $marc_file = &util::filename_cat($dir,$self->{'marc_file'});
173 $self->read_marc_content ($marc_file);
174
175 if (defined $self->{'index_file'} && $self->{'index_file'} ne "") {
176 my $index_file = $self->{'index_file'};
177 push(@$extrametakeys,$index_file);
178 $extrametadata->{$index_file} = $self->{'saved_metadata'};
179 } else {
180 print STDERR "####Warning can't find main index file\n";
181 }
182 return 1;
183}
184
185# The BNContentePlug read() function. This function does all the right things
186# to make general options work for a given plugin. It calls the process()
187# function which does all the work specific to a plugin (like the old
188# read functions used to do). Most plugins should define their own
189# process() function and let this read() function keep control.
190#
191# Return number of files processed, undef if can't process
192# Note that $base_dir might be "" and that $file might
193# include directories
194
195sub readxxx {
196 my $self = shift (@_);
197 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
198 my $outhandle = $self->{'outhandle'};
199
200 my $filename = &util::filename_cat($base_dir, $file);
201 #return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
202
203 #return 0 if ($filename =~ /metsHTML\.xml$/);
204 return 0 if ($filename =~ /\.xml$/);
205 return 0 if (defined $self->{'file_blocks'}->{'filename'});
206
207 return undef;
208}
209
210# do plugin specific processing of doc_obj
211sub process {
212 my $self = shift (@_);
213 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
214 my $outhandle = $self->{'outhandle'};
215
216 return 1;
217}
218
219sub METS_Doctype {
220 my ($expat, $name, $sysid, $pubid, $internal) = @_;
221 die if ($name !~ /^metsHTML\.xml$/);
222}
223
224sub MARC_Doctype {
225 my ($expat, $name, $sysid, $pubid, $internal) = @_;
226 print STDERR "###MARC Name =$name\n";
227 #die if ($name !~ /^metsHTML\.xml$/);
228 #die if (!$name);
229}
230
231sub METS_StartTag {
232 my ($expat, $element, %attr) = @_;
233 my @file_blocks;
234 if ($element eq "dmdSec") {
235 $self->{'marc_file'} = "";
236 } elsif ($element eq "mdRef") {
237 my $marc_href = $attr{'xlink:href'};
238 $self->{'marc_file'} = $marc_href;
239 } elsif ($element eq "FLocat"){
240 my $assocfiles = $attr{'xlink:href'};
241 if ($assocfiles =~ /index\.html$/) {
242 my $index_file = &util::filename_cat($self->{'dir'}, $assocfiles);
243 $self->{'index_file'} = $index_file;
244 } else {
245 my $link = &util::filename_cat($self->{'dir'}, $assocfiles);
246 #$self->{'file_blocks'}->{$link} = 1;
247 }
248 }
249}
250
251sub METS_EndTag {
252 my ($expat, $element, %attr) = @_;
253}
254
255sub MARC_StartTag {
256 my ($expat, $element, %attr) = @_;
257
258 if ($element eq "record") {
259 $self->{'saved_metadata'} = {};
260 }
261 elsif ($element eq "datafield") {
262 $self->{'metaname'} = $element;
263 $self->{'datafield'} = $attr{'tag'};
264 }
265 elsif ($element eq "subfield") {
266 $self->{'subfield'} = $attr{'code'};
267 $self->{'text'} = "";
268 }
269}
270
271sub MARC_EndTag {
272 my ($expat, $element) = @_;
273
274 if ($element eq "datafield") {
275 $self->{'metaname'} = "";
276 }
277 elsif ($element eq "subfield") {
278 my $mvalue = $self->{'text'};
279 my $mname = $self->{'datafield'}."^".$self->{'subfield'};
280 #print STDERR "**** $mname = $mvalue\n";
281
282 $mvalue =~ s/\[/&\#91;/g;
283 $mvalue =~ s/\[/&\#93;/g;
284
285
286 if (defined $self->{'saved_metadata'}->{$mname}) {
287 # accumulate - add value to existing value(s)
288 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
289 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
290 } else {
291 $self->{'saved_metadata'}->{$mname} =
292 [$self->{'saved_metadata'}->{$mname}, $mvalue];
293 }
294 } else {
295 # accumulate - add value into (currently empty) array
296 $self->{'saved_metadata'}->{$mname} = [$mvalue];
297 }
298 # store something here
299 $self->{'subfield'} = "";
300 $self->{'text'} = "";
301 }
302
303}
304
305
306# This Char function overrides the one in XML::Parser::Stream to overcome a
307# problem where $expat->{Text} is treated as the return value, slowing
308# things down significantly in some cases.
309sub Char {
310 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
311 $_[0]->{'Text'} .= $_[1];
312 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
313 $self->{'text'} .= $_[1];
314 }
315 return undef;
316}
317
3181;
Note: See TracBrowser for help on using the repository browser.