source: trunk/gsdl/perllib/plugins/BNContentePlug.pm@ 10606

Last change on this file since 10606 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1###########################################################################
2#
# BNContentePlug.pm -- plugin for importing the BN-Portugal Collection
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# BNContentePlug - 11/2004
27#
28#
# This plugin takes "metsHTML.xml" and "record/NCB_***.xml"; the latter file contains
# MARC details about the BN-Portugal collection. The intention is to import such a
# collection into GS2.
31
32package BNContentePlug;
33
34use BasPlug;
35use plugin;
36#use ghtml;
37use XMLParser;
38use XML::Parser;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
# Compile-time setup: make this package inherit from BasPlug and put the
# "perllib/cpan" directory under $GSDLHOME at the front of the module
# search path.
BEGIN {
    @BNContentePlug::ISA = qw(BasPlug);
    unshift @INC, $ENV{'GSDLHOME'} . "/perllib/cpan";
}
47
# Argument descriptors specific to this plugin; new() appends them to the
# shared "ArgList". The defaults for process_exp and block_exp are taken
# from get_default_process_exp() and get_default_block_exp().
#
# NOTE(review): the second flag is named "first_inoder_ext" while its
# description key reads "first_inorder_ext". The name is left untouched
# here because it is presumably the option name callers pass -- confirm
# which spelling is intended before renaming.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'string',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'only_first_doc',
        desc => '{BNContentePlug.only_first_doc}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'first_inoder_ext',
        desc => '{BNContentePlug.first_inorder_ext}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'first_inorder_mime',
        desc => '{BNContentePlug.first_inorder_mime}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'string',
        deft => get_default_block_exp(),
        reqd => 'no',
    },
];
71
# Option descriptor for this plugin; new() appends it to the shared
# "OptList". It carries the argument descriptors defined above.
my $options = {
    name     => 'BNContentePlug',
    desc     => '{BNContentePlug.desc}',
    inherits => 'yes',
    args     => $arguments,
};

# Important variation on the regular plugin structure: $self has to be
# declared as a variable global to this file so that the XML::Parser
# callback routines can access the contents of the plugin object.
my $self;
81
# Returns the default regular expression (as a string) selecting the files
# this plugin processes: names ending in "metsHTML.xml", case-insensitively.
# The optional invocant argument is accepted but not used.
sub get_default_process_exp {
    my ($plugin) = @_;

    my $pattern = '(?i)(metsHTML\.xml)$';
    return $pattern;
}
87
# Block files.
#
# Returns the default regular expression (as a string) for files to block.
# The pattern matches any name that contains a dot, i.e. it is meant to
# block all files besides the contents handled by this plugin.
# An earlier variant of the pattern was: (?i)(metsHTML\.xml|)$
# The optional invocant argument is accepted but not used.
sub get_default_block_exp {
    my ($plugin) = @_;

    my $pattern = '(?i)((.*?)\.(.*?))$';
    return $pattern;
}
96
# Constructor.
#
# Adds the class name to $pluginlist, appends this plugin's argument and
# option descriptors to $hashArgOptLists, builds the underlying object with
# BasPlug's constructor and attaches two XML::Parser instances to it: one
# for the METS file (metsHTML.xml) and one for the MARC record files
# (NCB_???.xml).
#
# Note that the object is stored in the file-scoped $self (there is
# deliberately no "my" here) because the parser callbacks read it.
#
# Returns the object blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @$pluginlist, $class;

    if (defined $arguments) {
        push @{ $hashArgOptLists->{'ArgList'} }, @$arguments;
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{'OptList'} }, $options;
    }

    if (defined $hashArgOptLists) {
        $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = BasPlug->new($pluginlist, $inputargs);
    }

    # Parser for metsHTML.xml.
    my %mets_handlers = (
        'Doctype' => \&METS_Doctype,
        'Start'   => \&METS_StartTag,
        'End'     => \&METS_EndTag,
    );
    $self->{'mets_parser'} = XML::Parser->new(
        'Style'    => 'Stream',
        'Handlers' => \%mets_handlers,
    );

    # Parser for the NCB_???.xml MARC files.
    my %marc_handlers = (
        'Char'    => \&Char,
        'Doctype' => \&MARC_Doctype,
        'Start'   => \&MARC_StartTag,
        'End'     => \&MARC_EndTag,
    );
    $self->{'marc_parser'} = XML::Parser->new(
        'Style'    => 'Stream',
        'Handlers' => \%marc_handlers,
    );

    $self->{'index_file'} = "";

    return bless $self, $class;
}
127
# Parse one MARC record file (NCB_???.xml) with the plugin's MARC parser.
#
#   $marc_file - path of the file handed to the parser's parsefile() method
#
# Dies with a "not a well formed XML file" message, which includes the
# underlying error, if parsing raises an exception.
sub read_marc_content {
    my ($self, $marc_file) = @_;

    eval { $self->{'marc_parser'}->parsefile($marc_file); };

    die "BNContentePlug: ERROR $marc_file is not a well formed XML file ($@)\n"
        if $@;
}
141
# Read metsHTML.xml from the BN-Portugal collection.
#
# Parses the METS file to learn which MARC record file and which main
# index.html it refers to, parses that MARC file, and registers the
# collected metadata in $extrametadata under the index file's path.
#
# Parameters actually used:
#   $base_dir, $file - joined to form the name of the file to examine
#   $extrametakeys   - array ref; the index file path is pushed onto it
#   $extrametadata   - hash ref; the metadata is stored under that path
# The remaining parameters are accepted but not used here.
#
# Returns 1 when the file was handled and undef when it is not an existing
# metsHTML.xml. Dies if either XML file cannot be parsed.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($filename !~ /metsHTML\.xml$/ || !-f $filename) {
        # Not ours: still record other XML files, log.txt and isbd.html in
        # file_blocks before declining the file.
        if ($filename =~ /\.xml$/i || $filename =~ /log\.txt$/i || $filename =~ /isbd\.html$/i) {
            $self->{'file_blocks'}->{$filename} = 1;
        }
        return undef;
    }
    $self->{'file_blocks'}->{$filename} = 1;

    print $outhandle "BNContentePlug: extracting metadata from $filename\n"
        if $self->{'verbosity'} > 1;

    # Directory part of the file name, including the trailing separator.
    my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
    $self->{'dir'} = $dir;

    # Reset the per-file state filled in by the parser callbacks. Without
    # this, a METS file that lists no index.html (or a MARC file without a
    # record element) would silently reuse the values left behind by the
    # previously processed file, and the warning below could never fire.
    $self->{'index_file'}     = "";
    $self->{'saved_metadata'} = {};

    eval {
        $self->{'mets_parser'}->parsefile($filename);
    };

    if ($@) {
        die "BNContentePlug: ERROR $filename is not a well formed XML file ($@)\n";
    }

    # read NCB_???.xml to parse MARC records and save as metadata
    my $marc_file = &util::filename_cat($dir, $self->{'marc_file'});
    $self->read_marc_content($marc_file);

    if (defined $self->{'index_file'} && $self->{'index_file'} ne "") {
        my $index_file = $self->{'index_file'};
        push(@$extrametakeys, $index_file);
        $extrametadata->{$index_file} = $self->{'saved_metadata'};
    } else {
        print STDERR "####Warning can't find main index file\n";
    }
    return 1;
}
187
# The BNContentePlug read() function. This function does all the right things
# to make general options work for a given plugin. It calls the process()
# function which does all the work specific to a plugin (like the old
# read functions used to do). Most plugins should define their own
# process() function and let this read() function keep control.
#
# Return number of files processed, undef if can't process.
# Note that $base_dir might be "" and that $file might
# include directories.
#
# NOTE(review): the sub is named readxxx rather than read, so it is
# presumably disabled on purpose -- confirm before renaming it.
#
# As written it returns 0 for any file whose name ends in ".xml" or that
# metadata_read() recorded in file_blocks, and undef for everything else.
sub readxxx {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    #return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;

    #return 0 if ($filename =~ /metsHTML\.xml$/);
    return 0 if ($filename =~ /\.xml$/);

    # Look the file up by its actual path. The previous code used the
    # literal key 'filename', which never matches what metadata_read()
    # stores (it keys file_blocks by the full file name).
    return 0 if (defined $self->{'file_blocks'}->{$filename});

    return undef;
}
212
# Do plugin-specific processing of doc_obj.
#
# This plugin has nothing to do at this stage: the arguments are unpacked
# but not used, and the sub always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $outhandle = $self->{'outhandle'};

    return 1;
}
221
# Doctype handler for the METS parser.
#
# Receives ($expat, $name, $sysid, $pubid, $internal) and dies (with no
# message of its own) unless the doctype name is exactly "metsHTML.xml".
sub METS_Doctype {
    my ($parser, $doctype_name, $system_id, $public_id, $internal_subset) = @_;

    unless ($doctype_name =~ /^metsHTML\.xml$/) {
        die;
    }
}
226
# Doctype handler for the MARC parser.
#
# Receives ($expat, $name, $sysid, $pubid, $internal) and only reports the
# doctype name on STDERR; the checks that used to reject unexpected names
# are disabled.
sub MARC_Doctype {
    my ($parser, $doctype_name, $system_id, $public_id, $internal_subset) = @_;

    print STDERR "###MARC Name =" . $doctype_name . "\n";
    #die if ($name !~ /^metsHTML\.xml$/);
    #die if (!$name);
}
233
# Start-tag handler for the METS parser. Works on the file-scoped $self.
#
#   dmdSec - clears the remembered MARC file name
#   mdRef  - remembers its xlink:href attribute as the MARC file name
#   FLocat - if its xlink:href ends in "index.html", remembers that path
#            (joined onto the current directory) as the main index file
# All other elements are ignored.
sub METS_StartTag {
    my ($parser, $tag, %attrs) = @_;

    if ($tag eq "dmdSec") {
        $self->{'marc_file'} = "";
        return;
    }

    if ($tag eq "mdRef") {
        $self->{'marc_file'} = $attrs{'xlink:href'};
        return;
    }

    return unless $tag eq "FLocat";

    my $href     = $attrs{'xlink:href'};
    my $is_index = ($href =~ /index\.html$/);
    my $path     = &util::filename_cat($self->{'dir'}, $href);

    if ($is_index) {
        $self->{'index_file'} = $path;
    }
    # Other associated files are currently not recorded:
    #$self->{'file_blocks'}->{$path} = 1;
}
253
# End-tag handler for the METS parser. Nothing needs to happen when a METS
# element closes, so the arguments are unpacked and otherwise ignored.
sub METS_EndTag {
    my ($parser, $tag, %attrs) = @_;
}
257
# Start-tag handler for the MARC parser. Works on the file-scoped $self.
#
#   record    - starts a fresh, empty metadata hash
#   datafield - remembers the element name and its "tag" attribute
#   subfield  - remembers its "code" attribute and clears the text buffer
#               that Char() appends to
# All other elements are ignored.
sub MARC_StartTag {
    my ($parser, $tag, %attrs) = @_;

    if ($tag eq "subfield") {
        $self->{'subfield'} = $attrs{'code'};
        $self->{'text'}     = "";
    }
    elsif ($tag eq "datafield") {
        $self->{'metaname'}  = $tag;
        $self->{'datafield'} = $attrs{'tag'};
    }
    elsif ($tag eq "record") {
        $self->{'saved_metadata'} = {};
    }
}
273
# End-tag handler for the MARC parser. Works on the file-scoped $self.
#
#   datafield - clears the remembered element name
#   subfield  - stores the text collected by Char() in saved_metadata under
#               the name "<datafield tag>^<subfield code>", then clears the
#               subfield code and the text buffer
# All other elements are ignored.
#
# Square brackets in the value are replaced by the numeric character
# references &#91; and &#93; before it is stored.
sub MARC_EndTag {
    my ($expat, $element) = @_;

    if ($element eq "datafield") {
        $self->{'metaname'} = "";
    }
    elsif ($element eq "subfield") {
        my $mvalue = $self->{'text'};
        my $mname  = $self->{'datafield'} . "^" . $self->{'subfield'};
        #print STDERR "**** $mname = $mvalue\n";

        # Escape both brackets. The second substitution used to match "["
        # again, so closing brackets were never escaped.
        $mvalue =~ s/\[/&\#91;/g;
        $mvalue =~ s/\]/&\#93;/g;

        my $saved = $self->{'saved_metadata'};
        if (defined $saved->{$mname}) {
            # accumulate - add value to existing value(s)
            if (ref($saved->{$mname}) eq "ARRAY") {
                push(@{ $saved->{$mname} }, $mvalue);
            } else {
                $self->{'saved_metadata'}->{$mname} = [ $saved->{$mname}, $mvalue ];
            }
        } else {
            # accumulate - add value into (currently empty) array
            $self->{'saved_metadata'}->{$mname} = [$mvalue];
        }

        # The subfield is finished: stop Char() from collecting text.
        $self->{'subfield'} = "";
        $self->{'text'}     = "";
    }
}
307
308
# This Char function overrides the one in XML::Parser::Stream to overcome a
# problem where $expat->{Text} is treated as the return value, slowing
# things down significantly in some cases.
#
# Besides appending the character data to the parser's own Text buffer, it
# appends it to the plugin's text buffer (on the file-scoped $self) while a
# subfield code is set. Always returns undef.
sub Char {
    use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+

    my ($expat, $data) = @_;

    $expat->{'Text'} .= $data;

    my $inside_subfield = defined $self->{'subfield'} && $self->{'subfield'} ne "";
    if ($inside_subfield) {
        $self->{'text'} .= $data;
    }

    return undef;
}
320
3211;
Note: See TracBrowser for help on using the repository browser.