source: gsdl/trunk/perllib/plugins/CONTENTdmPlugin.pm@ 19811

Last change on this file since 19811 was 19052, checked in by davidb, 15 years ago

Modifications to CONENTdmPlugin so its works with Kathy's new plugin restructuring. Still considered experimental.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.8 KB
Line 
1###########################################################################
2#
3# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package CONTENTdmPlugin;
26
27use ConvertBinaryFile;
28use ReadXMLFile;
29
30use unicode;
31use ghtml;
32
33use strict;
34no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37use XMLParser;
38
39# inherit ReadXMLFile for the apply_xslt method
40sub BEGIN {
41 @CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42}
43
44
45my $convert_to_list =
46 [ { 'name' => "auto",
47 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
48 { 'name' => "html",
49 'desc' => "{ConvertBinaryFile.convert_to.html}" },
50 { 'name' => "text",
51 'desc' => "{ConvertBinaryFile.convert_to.text}" },
52 { 'name' => "pagedimg",
53 'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
54 ];
55
56
57
58my $arguments =
59 [
60 { 'name' => "convert_to",
61 'desc' => "{ConvertBinaryFile.convert_to}",
62 'type' => "enum",
63 'reqd' => "yes",
64 'list' => $convert_to_list,
65 'deft' => "html" },
66 { 'name' => "xslt",
67 'desc' => "{ReadXMLFile.xslt}",
68 'type' => "string",
69 'deft' => "",
70 'reqd' => "no" },
71 { 'name' => "process_exp",
72 'desc' => "{BasePlugin.process_exp}",
73 'type' => "regexp",
74 'deft' => &get_default_process_exp(),
75 'reqd' => "no" },
76 { 'name' => "block_exp",
77 'desc' => "{BasePlugin.block_exp}",
78 'type' => "regexp",
79 'deft' => &get_default_block_exp() }
80];
81
82my $options = { 'name' => "CONTENTdmPlugin",
83 'desc' => "{CONTENTdmPlugin.desc}",
84 'abstract' => "no",
85 'inherits' => "yes",
86 # CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
87 'srcreplaceable' => "no",
88 'args' => $arguments };
89
90sub new {
91 my ($class) = shift (@_);
92 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
93 push(@$pluginlist, $class);
94
95 push(@$inputargs,"-title_sub");
96 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
97
98 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99 push(@{$hashArgOptLists->{"OptList"}},$options);
100
101 my @arg_array = @$inputargs;
102 my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
103
104 if ($self->{'info_only'}) {
105 # don't worry about any options etc
106 return bless $self, $class;
107 }
108
109 my $parser = new XML::Parser('Style' => 'Stream',
110 'Pkg' => 'ReadXMLFile',
111 'PluginObj' => $self,
112 'Handlers' => {'Char' => \&ReadXMLFile::Char,
113 'XMLDecl' => \&ReadXMLFile::XMLDecl,
114 'Entity' => \&ReadXMLFile::Entity,
115 'Doctype' => \&ReadXMLFile::Doctype,
116 'Default' => \&ReadXMLFile::Default
117 });
118 $self->{'parser'} = $parser;
119
120
121 $self->{'rdf_desc'} = undef;
122 $self->{'about_key'} = undef;
123 $self->{'metadata_name'} = undef;
124 $self->{'metadata_value'} = undef;
125
126 $self->{'convert_to'} = "PagedImage";
127 my $secondary_plugin_options = $self->{'secondary_plugin_options'};
128
129 if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
130 $secondary_plugin_options->{'PagedImagePlugin'} = [];
131 }
132 my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
133 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
134 push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
135 push(@$pagedimg_options, "-file_rename_method", "none");
136
137 $self = bless $self, $class;
138
139# ***** no longer needed!
140# # This needs to be done after blss, to $self passed to XML::Parser
141# # can correctly resolve the right call-back methods during XML parsing
142
143
144 $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
145 return $self;
146}
147
148sub get_default_process_exp {
149 my $self = shift (@_);
150
151 return q^(?i)\.rdf$^;
152}
153
154sub get_default_block_exp {
155 return q^(?i)\.(jpg|jpeg|gif)$^;
156}
157
158
159
160sub rdf_desc_to_id
161{
162 my $self = shift (@_);
163 my ($rdf_desc) = @_;
164
165 my $rdf_id = {};
166
167 # initialise any .cpd (=complex multi page) structures
168
169 foreach my $about_key (keys %{$rdf_desc}) {
170 if ($about_key =~ m/\.cpd$/) {
171 my $about = $rdf_desc->{$about_key};
172 my $id = $about->{'dc:identifier'};
173
174 if ($id =~ m/^\s*$/) {
175 # missing id, make one up based on about attribute
176
177 my ($tailname, $dirname, $suffix)
178 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
179
180 $id = "about:$tailname";
181 }
182
183 $rdf_id->{$id} = $about;
184 $rdf_id->{$id}->{'ex:filename'} = $about_key;
185 $rdf_id->{$id}->{'ex:type'} = "complex";
186 $rdf_id->{$id}->{'pages'} = [];
187 }
188
189 }
190
191 # now add in *non* .cpd items
192
193 foreach my $about_key (keys %{$rdf_desc}) {
194 if ($about_key !~ m/\.cpd$/) {
195 my $about = $rdf_desc->{$about_key};
196 my $id = $about->{'dc:identifier'};
197
198
199 if ($id =~ m/^\s*$/) {
200 # missing id, make one up based on about attribute
201
202 my ($tailname, $dirname, $suffix)
203 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
204
205 $id = "about:$tailname";
206 }
207
208 if (defined $rdf_id->{$id}) {
209 $about->{'ex:filename'} = $about_key;
210
211 # dealing with complex multi-page situation
212 # Add to existing structure
213
214 my $pages = $rdf_id->{$id}->{'pages'};
215 push(@$pages,$about)
216 }
217 else {
218 # New entry
219
220 $rdf_id->{$id} = $about;
221 $rdf_id->{$id}->{'ex:type'} = "simple";
222 $rdf_id->{$id}->{'ex:filename'} = $about_key;
223 }
224 }
225
226 }
227
228 return $rdf_id;
229}
230
231
232sub metadata_table_txt_file
233{
234 my $self = shift (@_);
235 my ($output_root,$page_num) = @_;
236
237 my $txt_filename = $output_root."_page_$page_num.txt";
238
239 my ($tailname, $dirname, $suffix)
240 = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
241
242 my $txt_file = "$tailname$suffix";
243
244 return $txt_file;
245}
246
247
248sub output_metadata_table
249{
250 my $self = shift (@_);
251 my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
252
253 my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
254
255 open(TOUT,">$txt_filename")
256 || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
257
258 print TOUT $page->{'MetadataTable'};
259 delete $page->{'MetadataTable'};
260
261 close (TOUT);
262}
263
264
265sub rdf_id_to_item_file
266{
267 my $self = shift (@_);
268 my ($rdf_id,$tmp_dirname,$output_root) = @_;
269
270 my $item_file_list = [];
271
272 foreach my $id (keys %{$rdf_id}) {
273
274 my $id_safe = $id;
275 $id_safe =~ s/ /-/g;
276
277 my $output_filename = $output_root."_$id_safe.item";
278 open(FOUT,">$output_filename")
279 || die "Unable to open $output_filename: $!\n";
280
281
282 print FOUT "<PagedDocument>\n";
283
284 my $rdf_doc = $rdf_id->{$id};
285 foreach my $metadata_name (keys %$rdf_doc) {
286
287
288 next if ($metadata_name eq "pages");
289
290 my $metadata_value = $rdf_doc->{$metadata_name};
291
292 # convert ns:name to ns.Name
293 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
294
295 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
296 }
297
298 if ($rdf_doc->{'ex:type'} eq "complex") {
299 my $pages = $rdf_doc->{'pages'};
300 my $page_num = 1;
301
302 foreach my $page (@$pages) {
303
304 my $imgfile = $page->{'ex:filename'};
305 if ($imgfile =~ m/(http|ftp):/) {
306 $imgfile = "empty.jpg";
307 }
308 else {
309 $imgfile = &util::filename_cat("..","import",$imgfile);
310 }
311
312 my $txt_file
313 = $self->metadata_table_txt_file($output_root,$page_num);
314
315 $self->output_metadata_table($page,$page_num,
316 $tmp_dirname,$txt_file);
317
318
319 print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
320
321 foreach my $metadata_name (keys %$page) {
322
323 my $metadata_value = $rdf_doc->{$metadata_name};
324 # convert ns:name to ns.Name
325 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
326
327 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
328 }
329
330
331 $page_num++;
332
333
334 print FOUT " </Page>\n";
335 }
336 }
337 else {
338 # simple
339 # duplicate top-level metadata for now plus image to bind to
340
341 my $imgfile = $rdf_doc->{'ex:filename'};
342 if ($imgfile =~ m/(http|ftp):/) {
343 $imgfile = "empty.jpg";
344 }
345 else {
346 $imgfile = &util::filename_cat("..","import",$imgfile);
347 $imgfile = "../import/".$imgfile;
348 }
349
350
351 my $txt_file = $self->metadata_table_txt_file($output_root,1);
352 $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
353
354 print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
355 foreach my $metadata_name (keys %$rdf_doc) {
356
357 my $metadata_value = $rdf_doc->{$metadata_name};
358
359 # convert ns:name to ns.Name
360 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
361
362 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
363 }
364 print FOUT " </Page>\n";
365
366 }
367
368 print FOUT "</PagedDocument>\n";
369 close(FOUT);
370
371 push(@$item_file_list,$output_filename);
372
373 }
374
375
376 return $item_file_list;
377}
378
379
380
381sub xml_area_convert_file
382{
383 my $self = shift (@_);
384 my ($input_filename, $tmp_dirname, $output_root) = @_;
385
386 eval {
387 # Build up hash table/tree of all records
388
389 my $xslt = $self->{'xslt'};
390 if (defined $xslt && ($xslt ne "")) {
391 # perform xslt
392 my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
393
394 open(TOUT,">/tmp/tout.xml")
395 || die "Unable to open /tmp/tout.xml: $!\n";
396 print TOUT $transformed_xml;
397 close(TOUT);
398
399
400 # feed transformed file (now in memory as string) into XML parser
401 $self->{'parser'}->parse($transformed_xml);
402 }
403 else {
404 $self->{'parser'}->parsefile($input_filename);
405 }
406 };
407
408 if ($@) {
409
410 # parsefile may either croak somewhere in XML::Parser (e.g. because
411 # the document is not well formed) or die somewhere in XMLPlug or a
412 # derived plugin (e.g. because we're attempting to process a
413 # document whose DOCTYPE is not meant for this plugin). For the
414 # first case we'll print a warning and continue, for the second
415 # we'll just continue quietly
416
417 print STDERR "**** Error is: $@\n";
418
419 my $file = $self->{'file'};
420
421 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
422 if (defined $msg) {
423 my $outhandle = $self->{'outhandle'};
424 my $plugin_name = ref ($self);
425 print $outhandle "$plugin_name failed to process $file ($msg)\n";
426 }
427
428 my $gli = $self->{'gli'};
429
430 # reset ourself for the next document
431 $self->{'section_level'}=0;
432 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
433 return ("fail",undef); # error during processing
434 }
435
436 my $rdf_desc = $self->{'rdf_desc'};
437
438# foreach my $about_key (keys %{$rdf_desc}) {
439# my $about = $rdf_desc->{$about_key};
440# foreach my $metadata_name (keys %{$about}) {
441#
442# my $metadata_value = $about->{$metadata_name};
443## print STDERR " $metadata_name: $metadata_value\n";
444# }
445# }
446
447
448 # Merge entries with same name
449
450
451 my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
452
453# foreach my $about_key (keys %{$merged_rdf_id}) {
454# my $about = $merged_rdf_id->{$about_key};
455# foreach my $metadata_name (keys %{$about}) {
456#
457# my $metadata_value = $about->{$metadata_name};
458## print STDERR " $metadata_name: $metadata_value\n";
459# }
460# }
461
462
463
464 my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
465 $output_root);
466
467 return ("item",$item_files);
468}
469
470
471# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
472# to CONTENTdm
473#
474# A better (i.e. in the future) solution would be to see if this can be
475# shifted into gsConvert.pl so there is no need to override the
476# default tmp_area_convert_file()
477
478
479sub tmp_area_convert_file {
480 my $self = shift (@_);
481 my ($output_ext, $input_filename, $textref) = @_;
482
483 # is textref ever used?!?
484
485 my $outhandle = $self->{'outhandle'};
486 my $convert_to = $self->{'convert_to'};
487 my $failhandle = $self->{'failhandle'};
488 my $convert_to_ext = $self->{'convert_to_ext'};
489
490 # softlink to collection tmp dir
491 my $tmp_dirname
492 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
493 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
494
495 # derive tmp filename from input filename
496 my ($tailname, $dirname, $suffix)
497 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
498
499 # Remove any white space from filename -- no risk of name collision, and
500 # makes later conversion by utils simpler. Leave spaces in path...
501 # tidy up the filename with space, dot, hyphen between
502 $tailname =~ s/\s+//g;
503 $tailname =~ s/\.+//g;
504 $tailname =~ s/\-+//g;
505
506 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
507 $suffix = lc($suffix);
508 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
509
510 &util::soft_link($input_filename, $tmp_filename);
511 my $verbosity = $self->{'verbosity'};
512 if ($verbosity > 0) {
513 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
514 }
515
516 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
517
518 # call xml_area_convert_file rather than gsConvert.pl
519
520 my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
521
522 my ($output_type,$item_files)
523 = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
524
525
526 my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
527 my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
528
529 print STDERR "***** No source image identified with item\n";
530
531 print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
532
533 &util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
534
535 # continue as before ...
536
537 # remove symbolic link to original file
538 &util::rm($tmp_filename);
539
540 # Check STDERR here
541 chomp $output_type;
542 if ($output_type eq "fail") {
543 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
544 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
545 $self->{'num_not_processed'} ++;
546 if (-s "$errlog") {
547 open(ERRLOG, "$errlog");
548 while (<ERRLOG>) {
549 print $outhandle "$_";
550 }
551 print $outhandle "\n";
552 close ERRLOG;
553 }
554 &util::rm("$errlog") if (-e "$errlog");
555 return [];
556 }
557
558 # store the *actual* output type and return the output filename
559 # it's possible we requested conversion to html, but only to text succeeded
560 #$self->{'convert_to_ext'} = $output_type;
561 if ($output_type =~ /html/i) {
562 $self->{'converted_to'} = "HTML";
563 } elsif ($output_type =~ /te?xt/i) {
564 $self->{'converted_to'} = "Text";
565 } elsif ($output_type =~ /item/i){
566 $self->{'converted_to'} = "PagedImage";
567 }
568
569
570 return $item_files;
571}
572
573
574
575
576# Override ConvertBinaryFile read
577# Needed so multiple .item files generated are sent down secondary plugin
578
579sub read {
580 my $self = shift (@_);
581 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
582
583
584 $self->{'gli'} = $gli;
585 $self->{'file'} = $file;
586
587 my $successful_rv = -1;
588
589 my $outhandle = $self->{'outhandle'};
590
591 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
592 return undef unless $self->can_process_this_file($filename_full_path);
593
594 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
595
596 # read() deviates at this point from ConvertBinaryFile
597 # Need to work with list of filename returned
598
599 my $output_ext = $self->{'convert_to_ext'};
600 my $conv_filename_list = [];
601
602 $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
603
604 if (scalar(@$conv_filename_list)==0) {
605 return -1;
606 } # had an error, will be passed down pipeline
607
608 foreach my $conv_filename ( @$conv_filename_list ) {
609 if (! -e "$conv_filename") {return -1;}
610 $self->{'conv_filename'} = $conv_filename; # is this used anywhere?
611 $self->convert_post_process($conv_filename);
612
613 my $secondary_plugins = $self->{'secondary_plugins'};
614 my $num_secondary_plugins = scalar(keys %$secondary_plugins);
615
616 if ($num_secondary_plugins == 0) {
617 print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
618 return 0; # effectively block it
619 }
620
621 my @plugin_names = keys %$secondary_plugins;
622 my $plugin_name = shift @plugin_names;
623
624 if ($num_secondary_plugins > 1) {
625 print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
626 }
627
628 my $secondary_plugin = $secondary_plugins->{$plugin_name};
629
630 # note: metadata is not carried on to the next level
631 my ($rv,$doc_obj)
632 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
633 $block_hash, $metadata, $processor, $maxdocs, $total_count,
634 $gli);
635
636 print STDERR "**** $conv_filename => returned rv = $rv\n";
637
638 if ((defined $rv) && ($rv>=0)) {
639 $successful_rv = 1;
640 }
641
642 # Override previous gsdlsourcefilename set by secondary plugin
643 my $collect_file = &util::filename_within_collection($filename_full_path);
644 my $collect_conv_file = &util::filename_within_collection($conv_filename);
645 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
646 $doc_obj->set_converted_filename($collect_conv_file);
647
648 my ($filemeta) = $file =~ /([^\\\/]+)$/;
649 $self->set_Source_metadata($doc_obj, $filemeta);
650 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
651 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
652
653 if ($self->{'cover_image'}) {
654 $self->associate_cover_image($doc_obj, $filename_full_path);
655 }
656
657 # do plugin specific processing of doc_obj
658 unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
659 print STDERR "***** process returned undef: $base_dir $file\n";
660 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
661 return -1;
662 }
663 # do any automatic metadata extraction
664 $self->auto_extract_metadata ($doc_obj);
665
666 # have we found a Title??
667 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
668
669 # add an OID
670 $self->add_OID($doc_obj);
671 # process the document
672 $processor->process($doc_obj);
673
674 $self->{'num_processed'} ++;
675 }
676
677 return $successful_rv;
678}
679
680sub process {
681
682 return 1;
683}
684
685# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
686sub process_old {
687 my $self = shift (@_);
688 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
689
690
691 my $secondary_plugins = $self->{'secondary_plugins'};
692 my @plugin_names = keys %$secondary_plugins;
693 my $plugin_name = shift @plugin_names; # already checked there is only one
694
695 my $secondary_plugin = $secondary_plugins->{$plugin_name};
696
697 my $result = $secondary_plugin->process(@_);
698
699 return $result;
700}
701
702
703# Called at the beginning of the XML document.
704sub xml_start_document {
705 my $self = shift(@_);
706 my ($expat) = @_;
707
708 $self->{'rdf_desc'} = {};
709}
710
711
712# Called for DOCTYPE declarations - use die to bail out if this doctype
713# is not meant for this plugin
714sub xml_doctype {
715 my $self = shift(@_);
716 my ($expat, $name, $sysid, $pubid, $internal) = @_;
717
718 die "" if ($name !~ /^rdf:RDF$/);
719
720 my $outhandle = $self->{'outhandle'};
721 print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
722
723}
724
725# Called for every start tag. The $_ variable will contain a copy of the
726# tag and the %_ variable will contain the element's attributes.
727sub xml_start_tag {
728 my $self = shift(@_);
729 my ($expat, $element) = @_;
730
731 if ($element eq "rdf:Description") {
732
733 my $about_key = $_{'about'};
734
735 my $rdf_desc = $self->{'rdf_desc'};
736 $rdf_desc->{$about_key} = {};
737
738 $self->{'about_key'} = $about_key;
739 $self->{'index_text'} = "";
740 $self->{'pp_text'} = "<table width=\"100%\">\n";
741
742
743 }
744 elsif (defined $self->{'about_key'}) {
745 $self->{'metadata_name'} = $element;
746 $self->{'metadata_value'} = "";
747 }
748
749}
750
751# Called for every end tag. The $_ variable will contain a copy of the tag.
752sub xml_end_tag {
753 my $self = shift(@_);
754 my ($expat, $element) = @_;
755
756 if ($element eq "rdf:Description") {
757 $self->{'pp_text'} .= "</table>\n";
758 ## ghtml::htmlsafe($self->{'pp_text'});
759
760
761 my $about_key = $self->{'about_key'};
762 my $about = $self->{'rdf_desc'}->{$about_key};
763 $about->{'IndexText'} = $self->{'index_text'};
764 $about->{'MetadataTable'} = $self->{'pp_text'};
765
766
767 $self->{'about_key'} = undef;
768 $self->{'index_text'} = undef;
769 $self->{'pp_text'} = undef;
770
771 }
772 elsif (defined $self->{'metadata_name'}) {
773 my $metadata_name = $self->{'metadata_name'};
774 if ($element eq $metadata_name) {
775 my $metadata_value = $self->{'metadata_value'};
776
777 my $about_key = $self->{'about_key'};
778 my $about = $self->{'rdf_desc'}->{$about_key};
779 $about->{$metadata_name} = $metadata_value;
780
781 $self->{'index_text'} .= "$metadata_value\n";
782 $self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
783
784 $self->{'metadata_name'} = undef;
785 $self->{'metadata_value'} = undef;
786 }
787 }
788}
789
790# Called just before start or end tags with accumulated non-markup text in
791# the $_ variable.
792sub xml_text {
793 my $self = shift(@_);
794 my ($expat) = @_;
795
796 if (defined $self->{'metadata_name'}) {
797 $self->{'metadata_value'} .= $_;
798 }
799}
800
801# Called at the end of the XML document.
802sub xml_end_document {
803 my $self = shift(@_);
804 my ($expat) = @_;
805}
806
807
8081;
Note: See TracBrowser for help on using the repository browser.