source: gsdl/trunk/perllib/plugins/CONTENTdmPlugin.pm@ 20790

Last change on this file since 20790 was 20790, checked in by kjdon, 11 years ago

set -processing_tmp_files option to secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)

  • Property svn:keywords set to Author Date Id Revision
File size: 21.8 KB
Line 
1###########################################################################
2#
3# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package CONTENTdmPlugin;
26
27use ConvertBinaryFile;
28use ReadXMLFile;
29
30use unicode;
31use ghtml;
32
33use strict;
34no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37use XMLParser;
38
39# inherit ReadXMLFile for the apply_xslt method
40sub BEGIN {
41 @CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42}
43
44
45my $convert_to_list =
46 [ { 'name' => "auto",
47 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
48 { 'name' => "html",
49 'desc' => "{ConvertBinaryFile.convert_to.html}" },
50 { 'name' => "text",
51 'desc' => "{ConvertBinaryFile.convert_to.text}" },
52 { 'name' => "pagedimg",
53 'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
54 ];
55
56
57
58my $arguments =
59 [
60 { 'name' => "convert_to",
61 'desc' => "{ConvertBinaryFile.convert_to}",
62 'type' => "enum",
63 'reqd' => "yes",
64 'list' => $convert_to_list,
65 'deft' => "html" },
66 { 'name' => "xslt",
67 'desc' => "{ReadXMLFile.xslt}",
68 'type' => "string",
69 'deft' => "",
70 'reqd' => "no" },
71 { 'name' => "process_exp",
72 'desc' => "{BasePlugin.process_exp}",
73 'type' => "regexp",
74 'deft' => &get_default_process_exp(),
75 'reqd' => "no" },
76 { 'name' => "block_exp",
77 'desc' => "{BasePlugin.block_exp}",
78 'type' => "regexp",
79 'deft' => &get_default_block_exp() }
80];
81
82my $options = { 'name' => "CONTENTdmPlugin",
83 'desc' => "{CONTENTdmPlugin.desc}",
84 'abstract' => "no",
85 'inherits' => "yes",
86 # CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
87 'srcreplaceable' => "no",
88 'args' => $arguments };
89
90sub new {
91 my ($class) = shift (@_);
92 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
93 push(@$pluginlist, $class);
94
95 push(@$inputargs,"-title_sub");
96 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
97
98 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99 push(@{$hashArgOptLists->{"OptList"}},$options);
100
101 my @arg_array = @$inputargs;
102 my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
103
104 if ($self->{'info_only'}) {
105 # don't worry about any options etc
106 return bless $self, $class;
107 }
108
109 my $parser = new XML::Parser('Style' => 'Stream',
110 'Pkg' => 'ReadXMLFile',
111 'PluginObj' => $self,
112 'Handlers' => {'Char' => \&ReadXMLFile::Char,
113 'XMLDecl' => \&ReadXMLFile::XMLDecl,
114 'Entity' => \&ReadXMLFile::Entity,
115 'Doctype' => \&ReadXMLFile::Doctype,
116 'Default' => \&ReadXMLFile::Default
117 });
118 $self->{'parser'} = $parser;
119
120
121 $self->{'rdf_desc'} = undef;
122 $self->{'about_key'} = undef;
123 $self->{'metadata_name'} = undef;
124 $self->{'metadata_value'} = undef;
125
126 $self->{'convert_to'} = "PagedImage";
127 my $secondary_plugin_options = $self->{'secondary_plugin_options'};
128
129 if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
130 $secondary_plugin_options->{'PagedImagePlugin'} = [];
131 }
132 my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
133 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
134 push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
135 push(@$pagedimg_options, "-file_rename_method", "none");
136 push(@$pagedimg_options, "-processing_tmp_files");
137 $self = bless $self, $class;
138
139# ***** no longer needed!
140# # This needs to be done after blss, to $self passed to XML::Parser
141# # can correctly resolve the right call-back methods during XML parsing
142
143
144 $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
145 return $self;
146}
147
148sub get_default_process_exp {
149 my $self = shift (@_);
150
151 return q^(?i)\.rdf$^;
152}
153
154sub get_default_block_exp {
155 return q^(?i)\.(jpg|jpeg|gif)$^;
156}
157
158
159
160sub rdf_desc_to_id
161{
162 my $self = shift (@_);
163 my ($rdf_desc) = @_;
164
165 my $rdf_id = {};
166
167 # initialise any .cpd (=complex multi page) structures
168
169 foreach my $about_key (keys %{$rdf_desc}) {
170 if ($about_key =~ m/\.cpd$/) {
171 my $about = $rdf_desc->{$about_key};
172 my $id = $about->{'dc:identifier'};
173
174 if ($id =~ m/^\s*$/) {
175 # missing id, make one up based on about attribute
176
177 my ($tailname, $dirname, $suffix)
178 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
179
180 $id = "about:$tailname";
181 }
182
183 $rdf_id->{$id} = $about;
184 $rdf_id->{$id}->{'ex:filename'} = $about_key;
185 $rdf_id->{$id}->{'ex:type'} = "complex";
186 $rdf_id->{$id}->{'pages'} = [];
187 }
188
189 }
190
191 # now add in *non* .cpd items
192
193 foreach my $about_key (keys %{$rdf_desc}) {
194 if ($about_key !~ m/\.cpd$/) {
195 my $about = $rdf_desc->{$about_key};
196 my $id = $about->{'dc:identifier'};
197
198
199 if ($id =~ m/^\s*$/) {
200 # missing id, make one up based on about attribute
201
202 my ($tailname, $dirname, $suffix)
203 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
204
205 $id = "about:$tailname";
206 }
207
208 if (defined $rdf_id->{$id}) {
209 $about->{'ex:filename'} = $about_key;
210
211 # dealing with complex multi-page situation
212 # Add to existing structure
213
214 my $pages = $rdf_id->{$id}->{'pages'};
215 push(@$pages,$about)
216 }
217 else {
218 # New entry
219
220 $rdf_id->{$id} = $about;
221 $rdf_id->{$id}->{'ex:type'} = "simple";
222 $rdf_id->{$id}->{'ex:filename'} = $about_key;
223 }
224 }
225
226 }
227
228 return $rdf_id;
229}
230
231
232sub metadata_table_txt_file
233{
234 my $self = shift (@_);
235 my ($output_root,$page_num) = @_;
236
237 my $txt_filename = $output_root."_page_$page_num.txt";
238
239 my ($tailname, $dirname, $suffix)
240 = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
241
242 my $txt_file = "$tailname$suffix";
243
244 return $txt_file;
245}
246
247
248sub output_metadata_table
249{
250 my $self = shift (@_);
251 my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
252
253 my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
254
255 open(TOUT,">$txt_filename")
256 || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
257
258 print TOUT $page->{'MetadataTable'};
259 delete $page->{'MetadataTable'};
260
261 close (TOUT);
262}
263
264
265sub rdf_id_to_item_file
266{
267 my $self = shift (@_);
268 my ($rdf_id,$tmp_dirname,$output_root) = @_;
269
270 my $item_file_list = [];
271
272 foreach my $id (keys %{$rdf_id}) {
273
274 my $id_safe = $id;
275 $id_safe =~ s/ /-/g;
276
277 my $output_filename = $output_root."_$id_safe.item";
278 open(FOUT,">$output_filename")
279 || die "Unable to open $output_filename: $!\n";
280
281
282 print FOUT "<PagedDocument>\n";
283
284 my $rdf_doc = $rdf_id->{$id};
285 foreach my $metadata_name (keys %$rdf_doc) {
286
287
288 next if ($metadata_name eq "pages");
289
290 my $metadata_value = $rdf_doc->{$metadata_name};
291
292 # convert ns:name to ns.Name
293 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
294
295 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
296 }
297
298 if ($rdf_doc->{'ex:type'} eq "complex") {
299 my $pages = $rdf_doc->{'pages'};
300 my $page_num = 1;
301
302 foreach my $page (@$pages) {
303
304 my $imgfile = $page->{'ex:filename'};
305 if ($imgfile =~ m/(http|ftp):/) {
306 $imgfile = "empty.jpg";
307 }
308 else {
309 $imgfile = &util::filename_cat("..","import",$imgfile);
310 }
311
312 my $txt_file
313 = $self->metadata_table_txt_file($output_root,$page_num);
314
315 $self->output_metadata_table($page,$page_num,
316 $tmp_dirname,$txt_file);
317
318
319 print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
320
321 foreach my $metadata_name (keys %$page) {
322
323 my $metadata_value = $rdf_doc->{$metadata_name};
324 # convert ns:name to ns.Name
325 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
326
327 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
328 }
329
330
331 $page_num++;
332
333
334 print FOUT " </Page>\n";
335 }
336 }
337 else {
338 # simple
339 # duplicate top-level metadata for now plus image to bind to
340
341 my $imgfile = $rdf_doc->{'ex:filename'};
342 if ($imgfile =~ m/(http|ftp):/) {
343 $imgfile = "empty.jpg";
344 }
345 else {
346 $imgfile = &util::filename_cat("..","import",$imgfile);
347 }
348
349
350 my $txt_file = $self->metadata_table_txt_file($output_root,1);
351 $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
352
353 print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
354 foreach my $metadata_name (keys %$rdf_doc) {
355
356 my $metadata_value = $rdf_doc->{$metadata_name};
357
358 # convert ns:name to ns.Name
359 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
360
361 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
362 }
363 print FOUT " </Page>\n";
364
365 }
366
367 print FOUT "</PagedDocument>\n";
368 close(FOUT);
369
370 push(@$item_file_list,$output_filename);
371
372 }
373
374
375 return $item_file_list;
376}
377
378
379
380sub xml_area_convert_file
381{
382 my $self = shift (@_);
383 my ($input_filename, $tmp_dirname, $output_root) = @_;
384
385 eval {
386 # Build up hash table/tree of all records
387
388 my $xslt = $self->{'xslt'};
389 if (defined $xslt && ($xslt ne "")) {
390 # perform xslt
391 my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
392
393 open(TOUT,">/tmp/tout.xml")
394 || die "Unable to open /tmp/tout.xml: $!\n";
395 print TOUT $transformed_xml;
396 close(TOUT);
397
398
399 # feed transformed file (now in memory as string) into XML parser
400 $self->{'parser'}->parse($transformed_xml);
401 }
402 else {
403 $self->{'parser'}->parsefile($input_filename);
404 }
405 };
406
407 if ($@) {
408
409 # parsefile may either croak somewhere in XML::Parser (e.g. because
410 # the document is not well formed) or die somewhere in XMLPlug or a
411 # derived plugin (e.g. because we're attempting to process a
412 # document whose DOCTYPE is not meant for this plugin). For the
413 # first case we'll print a warning and continue, for the second
414 # we'll just continue quietly
415
416 print STDERR "**** Error is: $@\n";
417
418 my $file = $self->{'file'};
419
420 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
421 if (defined $msg) {
422 my $outhandle = $self->{'outhandle'};
423 my $plugin_name = ref ($self);
424 print $outhandle "$plugin_name failed to process $file ($msg)\n";
425 }
426
427 my $gli = $self->{'gli'};
428
429 # reset ourself for the next document
430 $self->{'section_level'}=0;
431 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
432 return ("fail",undef); # error during processing
433 }
434
435 my $rdf_desc = $self->{'rdf_desc'};
436
437# foreach my $about_key (keys %{$rdf_desc}) {
438# my $about = $rdf_desc->{$about_key};
439# foreach my $metadata_name (keys %{$about}) {
440#
441# my $metadata_value = $about->{$metadata_name};
442## print STDERR " $metadata_name: $metadata_value\n";
443# }
444# }
445
446
447 # Merge entries with same name
448
449
450 my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
451
452# foreach my $about_key (keys %{$merged_rdf_id}) {
453# my $about = $merged_rdf_id->{$about_key};
454# foreach my $metadata_name (keys %{$about}) {
455#
456# my $metadata_value = $about->{$metadata_name};
457## print STDERR " $metadata_name: $metadata_value\n";
458# }
459# }
460
461
462
463 my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
464 $output_root);
465
466 return ("item",$item_files);
467}
468
469
470# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
471# to CONTENTdm
472#
473# A better (i.e. in the future) solution would be to see if this can be
474# shifted into gsConvert.pl so there is no need to override the
475# default tmp_area_convert_file()
476
477
478sub tmp_area_convert_file {
479 my $self = shift (@_);
480 my ($output_ext, $input_filename, $textref) = @_;
481
482 # is textref ever used?!?
483
484 my $outhandle = $self->{'outhandle'};
485 my $convert_to = $self->{'convert_to'};
486 my $failhandle = $self->{'failhandle'};
487 my $convert_to_ext = $self->{'convert_to_ext'};
488
489 # softlink to collection tmp dir
490 my $tmp_dirname
491 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
492 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
493
494 # derive tmp filename from input filename
495 my ($tailname, $dirname, $suffix)
496 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
497
498 # Remove any white space from filename -- no risk of name collision, and
499 # makes later conversion by utils simpler. Leave spaces in path...
500 # tidy up the filename with space, dot, hyphen between
501 $tailname =~ s/\s+//g;
502 $tailname =~ s/\.+//g;
503 $tailname =~ s/\-+//g;
504
505 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
506 $suffix = lc($suffix);
507 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
508
509 &util::soft_link($input_filename, $tmp_filename);
510 my $verbosity = $self->{'verbosity'};
511 if ($verbosity > 0) {
512 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
513 }
514
515 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
516
517 # call xml_area_convert_file rather than gsConvert.pl
518
519 my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
520
521 my ($output_type,$item_files)
522 = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
523
524
525 my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
526 my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
527
528 print STDERR "***** No source image identified with item\n";
529
530 print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
531
532 &util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
533
534 # continue as before ...
535
536 # remove symbolic link to original file
537 &util::rm($tmp_filename);
538
539 # Check STDERR here
540 chomp $output_type;
541 if ($output_type eq "fail") {
542 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
543 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
544 $self->{'num_not_processed'} ++;
545 if (-s "$errlog") {
546 open(ERRLOG, "$errlog");
547 while (<ERRLOG>) {
548 print $outhandle "$_";
549 }
550 print $outhandle "\n";
551 close ERRLOG;
552 }
553 &util::rm("$errlog") if (-e "$errlog");
554 return [];
555 }
556
557 # store the *actual* output type and return the output filename
558 # it's possible we requested conversion to html, but only to text succeeded
559 #$self->{'convert_to_ext'} = $output_type;
560 if ($output_type =~ /html/i) {
561 $self->{'converted_to'} = "HTML";
562 } elsif ($output_type =~ /te?xt/i) {
563 $self->{'converted_to'} = "Text";
564 } elsif ($output_type =~ /item/i){
565 $self->{'converted_to'} = "PagedImage";
566 }
567
568
569 return $item_files;
570}
571
572
573
574
575# Override ConvertBinaryFile read
576# Needed so multiple .item files generated are sent down secondary plugin
577
578sub read {
579 my $self = shift (@_);
580 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
581
582
583 $self->{'gli'} = $gli;
584 $self->{'file'} = $file;
585
586 my $successful_rv = -1;
587
588 my $outhandle = $self->{'outhandle'};
589
590 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
591 return undef unless $self->can_process_this_file($filename_full_path);
592
593 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
594
595 # read() deviates at this point from ConvertBinaryFile
596 # Need to work with list of filename returned
597
598 my $output_ext = $self->{'convert_to_ext'};
599 my $conv_filename_list = [];
600
601 $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
602
603 if (scalar(@$conv_filename_list)==0) {
604 return -1;
605 } # had an error, will be passed down pipeline
606
607 foreach my $conv_filename ( @$conv_filename_list ) {
608 if (! -e "$conv_filename") {return -1;}
609 $self->{'conv_filename'} = $conv_filename; # is this used anywhere?
610 $self->convert_post_process($conv_filename);
611
612 my $secondary_plugins = $self->{'secondary_plugins'};
613 my $num_secondary_plugins = scalar(keys %$secondary_plugins);
614
615 if ($num_secondary_plugins == 0) {
616 print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
617 return 0; # effectively block it
618 }
619
620 my @plugin_names = keys %$secondary_plugins;
621 my $plugin_name = shift @plugin_names;
622
623 if ($num_secondary_plugins > 1) {
624 print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
625 }
626
627 my $secondary_plugin = $secondary_plugins->{$plugin_name};
628
629 # note: metadata is not carried on to the next level
630 my ($rv,$doc_obj)
631 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
632 $block_hash, $metadata, $processor, $maxdocs, $total_count,
633 $gli);
634
635 print STDERR "**** $conv_filename => returned rv = $rv\n";
636
637 if ((defined $rv) && ($rv>=0)) {
638 $successful_rv = 1;
639 }
640
641 # Override previous gsdlsourcefilename set by secondary plugin
642 my $collect_file = &util::filename_within_collection($filename_full_path);
643 my $collect_conv_file = &util::filename_within_collection($conv_filename);
644 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
645 $doc_obj->set_converted_filename($collect_conv_file);
646
647 my ($filemeta) = $file =~ /([^\\\/]+)$/;
648 $self->set_Source_metadata($doc_obj, $filemeta);
649 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
650 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
651
652 if ($self->{'cover_image'}) {
653 $self->associate_cover_image($doc_obj, $filename_full_path);
654 }
655
656 # do plugin specific processing of doc_obj
657 unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
658 print STDERR "***** process returned undef: $base_dir $file\n";
659 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
660 return -1;
661 }
662 # do any automatic metadata extraction
663 $self->auto_extract_metadata ($doc_obj);
664
665 # have we found a Title??
666 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
667
668 # add an OID
669 $self->add_OID($doc_obj);
670 # process the document
671 $processor->process($doc_obj);
672
673 $self->{'num_processed'} ++;
674 }
675
676 return $successful_rv;
677}
678
679sub process {
680
681 return 1;
682}
683
684# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
685sub process_old {
686 my $self = shift (@_);
687 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
688
689
690 my $secondary_plugins = $self->{'secondary_plugins'};
691 my @plugin_names = keys %$secondary_plugins;
692 my $plugin_name = shift @plugin_names; # already checked there is only one
693
694 my $secondary_plugin = $secondary_plugins->{$plugin_name};
695
696 my $result = $secondary_plugin->process(@_);
697
698 return $result;
699}
700
701
702# Called at the beginning of the XML document.
703sub xml_start_document {
704 my $self = shift(@_);
705 my ($expat) = @_;
706
707 $self->{'rdf_desc'} = {};
708}
709
710
711# Called for DOCTYPE declarations - use die to bail out if this doctype
712# is not meant for this plugin
713sub xml_doctype {
714 my $self = shift(@_);
715 my ($expat, $name, $sysid, $pubid, $internal) = @_;
716
717 die "" if ($name !~ /^rdf:RDF$/);
718
719 my $outhandle = $self->{'outhandle'};
720 print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
721
722}
723
724# Called for every start tag. The $_ variable will contain a copy of the
725# tag and the %_ variable will contain the element's attributes.
726sub xml_start_tag {
727 my $self = shift(@_);
728 my ($expat, $element) = @_;
729
730 if ($element eq "rdf:Description") {
731
732 my $about_key = $_{'about'};
733
734 my $rdf_desc = $self->{'rdf_desc'};
735 $rdf_desc->{$about_key} = {};
736
737 $self->{'about_key'} = $about_key;
738 $self->{'index_text'} = "";
739 $self->{'pp_text'} = "<table width=\"100%\">\n";
740
741
742 }
743 elsif (defined $self->{'about_key'}) {
744 $self->{'metadata_name'} = $element;
745 $self->{'metadata_value'} = "";
746 }
747
748}
749
750# Called for every end tag. The $_ variable will contain a copy of the tag.
751sub xml_end_tag {
752 my $self = shift(@_);
753 my ($expat, $element) = @_;
754
755 if ($element eq "rdf:Description") {
756 $self->{'pp_text'} .= "</table>\n";
757 ## ghtml::htmlsafe($self->{'pp_text'});
758
759
760 my $about_key = $self->{'about_key'};
761 my $about = $self->{'rdf_desc'}->{$about_key};
762 $about->{'IndexText'} = $self->{'index_text'};
763 $about->{'MetadataTable'} = $self->{'pp_text'};
764
765
766 $self->{'about_key'} = undef;
767 $self->{'index_text'} = undef;
768 $self->{'pp_text'} = undef;
769
770 }
771 elsif (defined $self->{'metadata_name'}) {
772 my $metadata_name = $self->{'metadata_name'};
773 if ($element eq $metadata_name) {
774 my $metadata_value = $self->{'metadata_value'};
775
776 my $about_key = $self->{'about_key'};
777 my $about = $self->{'rdf_desc'}->{$about_key};
778 $about->{$metadata_name} = $metadata_value;
779
780 $self->{'index_text'} .= "$metadata_value\n";
781 $self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
782
783 $self->{'metadata_name'} = undef;
784 $self->{'metadata_value'} = undef;
785 }
786 }
787}
788
789# Called just before start or end tags with accumulated non-markup text in
790# the $_ variable.
791sub xml_text {
792 my $self = shift(@_);
793 my ($expat) = @_;
794
795 if (defined $self->{'metadata_name'}) {
796 $self->{'metadata_value'} .= $_;
797 }
798}
799
800# Called at the end of the XML document.
801sub xml_end_document {
802 my $self = shift(@_);
803 my ($expat) = @_;
804}
805
806
8071;
Note: See TracBrowser for help on using the repository browser.