source: trunk/gsdl/perllib/plugins/BNContentePlug.pm@ 9853

Last change on this file since 9853 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1###########################################################################
2#
3# BNContentePlug.pm -- plugin for importing the BN-Portugal Collection
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# BNContentePlug - 11/2004
27#
28#
29# This plugin takes "mets.xml" and "record/NCB_***.xml": these files contain MARC details
30# about the BN-Portugal collection. The intention is to import such a collection into GS2.
31
32package BNContentePlug;
33
34use BasPlug;
35use plugin;
36#use ghtml;
37use XMLParser;
38use XML::Parser;
39
# Compile-time setup: make the bundled CPAN directory under $GSDLHOME
# searchable and declare BasPlug as this package's parent class.
sub BEGIN {
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    @ISA = ('BasPlug');
}
44
# Option table for this plugin.  Each entry describes one option: its name,
# the resource-bundle key of its description, its type, an optional default
# value and whether it is required.
#
# NOTE(review): 'first_inoder_ext' looks like a misspelling of
# 'first_inorder_ext' (compare its 'desc' key).  It is kept unchanged here
# because renaming it would change the option name users configure.
my $arguments = [
    {
        name => "process_exp",
        desc => "{BasPlug.process_exp}",
        type => "string",
        deft => get_default_process_exp(),
        reqd => "no",
    },
    {
        name => "only_first_doc",
        desc => "{BNContentePlug.only_first_doc}",
        type => "flag",
        reqd => "no",
    },
    {
        name => "first_inoder_ext",
        desc => "{BNContentePlug.first_inorder_ext}",
        type => "flag",
        reqd => "no",
    },
    {
        name => "first_inorder_mime",
        desc => "{BNContentePlug.first_inorder_mime}",
        type => "flag",
        reqd => "no",
    },
    {
        name => "block_exp",
        desc => "{BasPlug.block_exp}",
        type => "string",
        deft => get_default_block_exp(),
        reqd => "no",
    },
];

# Plugin description record; new() appends it to the object's option list.
my $options = {
    name     => "BNContentePlug",
    desc     => "{BNContentePlug.desc}",
    inherits => "yes",
    args     => $arguments,
};
73
# Important variation on the regular plugin structure: $self is declared as
# a file-scoped variable so that the XML parser callback routines in this
# file (which are not called as methods) can reach the object's contents.
my $self;
78
# Return the default regular expression (as a string) selecting the files
# this plugin processes: names ending in "metsHTML.xml", case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;
    my $process_exp = q^(?i)(metsHTML\.xml)$^;
    return $process_exp;
}
84
85# block files
# Return the default regular expression (as a string) for blocked files.
# The pattern matches any name containing a dot, i.e. every file with an
# extension.  (An earlier variant that only matched metsHTML.xml was
# disabled in favour of this one.)
sub get_default_block_exp {
    my ($self) = @_;
    my $block_exp = q^(?i)((.*?)\.(.*?))$^;
    return $block_exp;
}
93
# Constructor.  Builds the object through BasPlug, registers this plugin's
# option description, validates the remaining arguments and attaches the
# two XML::Parser instances used for the METS file and the MARC record.
#
# Note that the object is stored in the file-scoped $self (not a new
# lexical) so the parser callbacks in this file can reach it.
sub new {
    my $class = shift;

    $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "BNContentePlug";

    my $inherited_options = $self->{'option_list'};
    push @{$inherited_options}, $options;

    unless (parsargv::parse(\@_, "allow_extra_options")) {
        print STDERR "\nBNContentePlug uses an incorrect option.\n";
        print STDERR "Check your collect.cfg configuration file.\n\n";
        $self->print_txt_usage("");    # Use default resource bundle
        die "\n";
    }

    # Callbacks for metsHTML.xml.
    my %mets_handlers = (
        'Doctype' => \&METS_Doctype,
        'Start'   => \&METS_StartTag,
        'End'     => \&METS_EndTag,
    );

    # Callbacks for the NCB_???.xml MARC record file.
    my %marc_handlers = (
        'Char'    => \&Char,
        'Doctype' => \&MARC_Doctype,
        'Start'   => \&MARC_StartTag,
        'End'     => \&MARC_EndTag,
    );

    $self->{'mets_parser'} = XML::Parser->new(
        'Style'    => 'Stream',
        'Handlers' => \%mets_handlers,
    );
    $self->{'marc_parser'} = XML::Parser->new(
        'Style'    => 'Stream',
        'Handlers' => \%marc_handlers,
    );

    $self->{'index_file'} = "";

    return bless $self, $class;
}
132
# Parse the MARC record file (NCB_???.xml) with the object's MARC parser.
# Any failure raised while parsing is re-thrown as a fatal
# "not a well formed XML file" error naming the file.
sub read_marc_content {
    my ($self, $marc_file) = @_;

    eval { $self->{'marc_parser'}->parsefile($marc_file) };
    return unless $@;

    die "BNContentePlug: ERROR $marc_file is not a well formed XML file ($@)\n";
}
146
147# Read metsHTML.xml from BN-Portugal collection
# Read a metsHTML.xml file from the BN-Portugal collection and attach the
# MARC metadata it references to the collection's main index file.
#
# Arguments (after the object): $pluginfo, $base_dir, $file, $metadata,
# $extrametakeys (array ref), $extrametadata (hash ref), $processor,
# $maxdocs, $gli.  Only $base_dir, $file, $extrametakeys and $extrametadata
# are used here.
#
# Returns undef when the file is not an existing metsHTML.xml (other .xml,
# log.txt and isbd.html files are recorded in 'file_blocks' on the way out),
# and 1 once the METS file and its MARC record have been parsed.  Dies if
# either file cannot be parsed or no MARC record file is referenced.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($filename !~ /metsHTML\.xml$/ || !-f $filename) {
        if ($filename =~ /\.xml$/i || $filename =~ /log\.txt$/i || $filename =~ /isbd\.html$/i) {
            $self->{'file_blocks'}->{$filename} = 1;
        }
        return undef;
    }
    $self->{'file_blocks'}->{$filename} = 1;

    print $outhandle "BNContentePlug: extracting metadata from $filename\n"
        if $self->{'verbosity'} > 1;

    # Directory part of the file name (everything up to the last separator).
    my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
    $self->{'dir'} = $dir;

    # Reset the state filled in by the parser callbacks.  Without this, the
    # index file, MARC file name and metadata found for a previously read
    # METS file would silently be reused for this one.
    $self->{'index_file'} = "";
    $self->{'marc_file'}  = "";
    delete $self->{'saved_metadata'};

    eval {
        $self->{'mets_parser'}->parsefile($filename);
    };

    if ($@) {
        die "BNContentePlug: ERROR $filename is not a well formed XML file ($@)\n";
    }

    # Without a referenced MARC file there is nothing to parse; fail with a
    # clear message rather than trying to parse the directory itself.
    if (!defined $self->{'marc_file'} || $self->{'marc_file'} eq "") {
        die "BNContentePlug: ERROR $filename does not reference a MARC record file\n";
    }

    # read NCB_???.xml to parse MARC records and save as metadata
    my $marc_file = &util::filename_cat($dir, $self->{'marc_file'});
    $self->read_marc_content($marc_file);

    if (defined $self->{'index_file'} && $self->{'index_file'} ne "") {
        my $index_file = $self->{'index_file'};
        push(@$extrametakeys, $index_file);
        $extrametadata->{$index_file} = $self->{'saved_metadata'};
    } else {
        print STDERR "####Warning can't find main index file\n";
    }
    return 1;
}
192
193# The BNContentePlug read() function. This function does all the right things
194# to make general options work for a given plugin. It calls the process()
195# function which does all the work specific to a plugin (like the old
196# read functions used to do). Most plugins should define their own
197# process() function and let this read() function keep control.
198#
199# Return number of files processed, undef if can't process
200# Note that $base_dir might be "" and that $file might
201# include directories
202
# Disabled variant of the plugin's read() function (note the "xxx" suffix).
#
# Arguments (after the object): $pluginfo, $base_dir, $file, $metadata,
# $processor, $maxdocs, $total_count, $gli.  Only $base_dir and $file are
# used.
#
# Returns 0 for files this plugin swallows without producing a document
# (any name ending in ".xml", and any file recorded in 'file_blocks'), and
# undef for everything else.
sub readxxx {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $filename = &util::filename_cat($base_dir, $file);

    return 0 if ($filename =~ /\.xml$/);

    # Look the file up by its actual name; the previous code used the
    # literal key 'filename', so blocked files were never recognised.
    return 0 if (defined $self->{'file_blocks'}->{$filename});

    return undef;
}
217
218# do plugin specific processing of doc_obj
# Plugin-specific processing of a document object.  This plugin does no
# per-document work here and simply reports success by returning 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    return 1;
}
226
# Doctype callback for the METS parser.  Dies unless the declared doctype
# name is exactly "metsHTML.xml".
sub METS_Doctype {
    my (undef, $doctype_name) = @_;
    die unless $doctype_name =~ /^metsHTML\.xml$/;
}
231
# Doctype callback for the MARC parser.  Only reports the declared doctype
# name on STDERR; no validation is performed.
sub MARC_Doctype {
    my (undef, $doctype_name) = @_;
    print STDERR "###MARC Name =$doctype_name\n";
}
238
# Start-tag callback for the METS parser.  Works on the file-scoped $self.
#
#   <dmdSec>  clears the remembered MARC file name;
#   <mdRef>   remembers its xlink:href as the MARC file name;
#   <FLocat>  if its xlink:href ends in "index.html", remembers that file
#             (joined onto the current directory) as the main index file.
#
# Other elements, and <FLocat> entries for other files, are ignored.
sub METS_StartTag {
    my ($expat, $element, %attr) = @_;

    if ($element eq "dmdSec") {
        $self->{'marc_file'} = "";
    }
    elsif ($element eq "mdRef") {
        $self->{'marc_file'} = $attr{'xlink:href'};
    }
    elsif ($element eq "FLocat") {
        my $href = $attr{'xlink:href'};

        # An <FLocat> without an xlink:href has nothing to match against.
        if (defined $href && $href =~ /index\.html$/) {
            $self->{'index_file'} = &util::filename_cat($self->{'dir'}, $href);
        }
    }
}
258
# End-tag callback for the METS parser.  Nothing needs to happen when a
# METS element closes, so the arguments are unpacked and ignored.
sub METS_EndTag {
    my ($parser, $tag, %attributes) = @_;
}
262
# Start-tag callback for the MARC parser.  Works on the file-scoped $self.
#
#   <subfield>   remembers the subfield code and starts a fresh text buffer;
#   <datafield>  remembers the element name and the field's tag attribute;
#   <record>     starts a fresh, empty metadata table.
sub MARC_StartTag {
    my ($parser, $tag, %attributes) = @_;

    if ($tag eq "subfield") {
        $self->{'subfield'} = $attributes{'code'};
        $self->{'text'}     = "";
    }
    elsif ($tag eq "datafield") {
        $self->{'metaname'}  = $tag;
        $self->{'datafield'} = $attributes{'tag'};
    }
    elsif ($tag eq "record") {
        $self->{'saved_metadata'} = {};
    }
}
278
# End-tag callback for the MARC parser.  Works on the file-scoped $self.
#
# Closing a <datafield> clears the remembered element name.  Closing a
# <subfield> stores the text collected for it under the metadata name
# "<datafield tag>^<subfield code>", with square brackets escaped as HTML
# character references; repeated names accumulate their values in an array.
sub MARC_EndTag {
    my ($expat, $element) = @_;

    if ($element eq "datafield") {
        $self->{'metaname'} = "";
    }
    elsif ($element eq "subfield") {
        my $mvalue = $self->{'text'};
        my $mname  = $self->{'datafield'} . "^" . $self->{'subfield'};

        # Escape both brackets.  The closing bracket was previously never
        # escaped because the second pattern also matched '['.
        $mvalue =~ s/\[/&\#91;/g;
        $mvalue =~ s/\]/&\#93;/g;

        my $saved    = ($self->{'saved_metadata'} ||= {});
        my $existing = $saved->{$mname};

        if (ref($existing) eq "ARRAY") {
            # accumulate - add value to existing value(s)
            push(@{$existing}, $mvalue);
        }
        elsif (defined $existing) {
            # a single earlier value: turn it into a list with the new one
            $saved->{$mname} = [$existing, $mvalue];
        }
        else {
            # first value for this name
            $saved->{$mname} = [$mvalue];
        }

        # This subfield is finished; stop collecting character data.
        $self->{'subfield'} = "";
        $self->{'text'}     = "";
    }
}
312
313
314# This Char function overrides the one in XML::Parser::Stream to overcome a
315# problem where $expat->{Text} is treated as the return value, slowing
316# things down significantly in some cases.
sub Char {
    use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
    my ($expat, $chars) = @_;

    # Keep the Stream-style text buffer up to date, as the stock handler does.
    $expat->{'Text'} .= $chars;

    # While inside a <subfield> (its code is remembered in the file-scoped
    # $self), also collect the characters as that subfield's value.
    my $code = $self->{'subfield'};
    $self->{'text'} .= $chars if defined $code && $code ne "";

    return undef;
}
325
3261;
Note: See TracBrowser for help on using the repository browser.