source: documented-example-collections/trunk/gsarch-e/index/assoc/4601A2ED.dir/CONTENTdmPlug.pm@ 18738

Last change on this file since 18738 was 18738, checked in by oranfry, 15 years ago

the rest of the documented example collections

  • Property svn:executable set to *
File size: 23.4 KB
Line 
1###########################################################################
2#
3# CONTENTdmPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package CONTENTdmPlug;
26
27use ConvertToPlug;
28use unicode;
29use ghtml;
30
31use strict;
32no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
33
34
35use XMLParser;
36
37sub BEGIN {
38 @CONTENTdmPlug::ISA = ('ConvertToPlug');
39}
40
41
42my $convert_to_list =
43 [ { 'name' => "auto",
44 'desc' => "{ConvertToPlug.convert_to.auto}" },
45 { 'name' => "html",
46 'desc' => "{ConvertToPlug.convert_to.html}" },
47 { 'name' => "text",
48 'desc' => "{ConvertToPlug.convert_to.text}" },
49 { 'name' => "pagedimg",
50 'desc' => "{ConvertToPlug.convert_to.pagedimg}"},
51 ];
52
53
54
55my $arguments =
56 [
57 { 'name' => "convert_to",
58 'desc' => "{ConvertToPlug.convert_to}",
59 'type' => "enum",
60 'reqd' => "yes",
61 'list' => $convert_to_list,
62 'deft' => "html" },
63 { 'name' => "xslt",
64 'desc' => "{XMLPlug.xslt}",
65 'type' => "string",
66 'deft' => "",
67 'reqd' => "no" },
68 { 'name' => "process_exp",
69 'desc' => "{BasPlug.process_exp}",
70 'type' => "regexp",
71 'deft' => &get_default_process_exp(),
72 'reqd' => "no" },
73 { 'name' => "block_exp",
74 'desc' => "{BasPlug.block_exp}",
75 'type' => "regexp",
76 'deft' => &get_default_block_exp() }
77];
78
79my $options = { 'name' => "CONTENTdmPlug",
80 'desc' => "{CONTENTdmPlug.desc}",
81 'abstract' => "no",
82 'inherits' => "yes",
83 'args' => $arguments };
84
85our ($self);
86
87sub new {
88 my ($class) = shift (@_);
89 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90 push(@$pluginlist, $class);
91
92 push(@$inputargs,"-title_sub");
93 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
94
95 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
96 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
97
98 my @arg_array = @$inputargs;
99 $self = (defined $hashArgOptLists)? new ConvertToPlug($pluginlist,$inputargs,$hashArgOptLists): new ConvertToPlug($pluginlist,$inputargs);
100
101 if ($self->{'info_only'}) {
102 # don't worry about any options etc
103 return bless $self, $class;
104 }
105
106 my $parser = new XML::Parser('Style' => 'Stream',
107 'Handlers' => {'Char' => \&Char,
108 'XMLDecl' => \&XMLDecl,
109 'Entity' => \&Entity,
110 'Doctype' => \&Doctype,
111 'Default' => \&Default
112 });
113 $self->{'parser'} = $parser;
114 $self->{'rdf_desc'} = undef;
115 $self->{'about_key'} = undef;
116 $self->{'metadata_name'} = undef;
117 $self->{'metadata_value'} = undef;
118
119 $self->{'convert_to'} = "PagedImg";
120 my $secondary_plugin_options = $self->{'secondary_plugin_options'};
121
122 if (!defined $secondary_plugin_options->{'PagedImgPlug'}){
123 $secondary_plugin_options->{'PagedImgPlug'} = [];
124 my $pagedimg_options = $secondary_plugin_options->{'PagedImgPlug'};
125 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
126 push(@$pagedimg_options, "-thumbnail", "-screenview");
127 }
128
129 $self = bless $self, $class;
130 $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
131 return $self;
132}
133
134sub get_default_process_exp {
135 my $self = shift (@_);
136
137 return q^(?i)\.rdf$^;
138}
139
140# so we don't inherit HTMLPlug's block exp...
141sub get_default_block_exp {
142 return q^(?i)\.(jpg|jpeg|gif)$^;
143}
144
145
146
147
148# A smarter (?) option would be to add XMLPlug into inheritence above
149# thereby avoiding a fair amount of code duplication
150
151sub apply_xslt
152{
153 my $self = shift @_;
154 my ($xslt,$filename) = @_;
155
156 my $outhandle = $self->{'outhandle'};
157
158 my $xslt_filename = $xslt;
159
160 if (! -e $xslt_filename) {
161 # Look in main site directory
162 my $gsdlhome = $ENV{'GSDLHOME'};
163 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
164 }
165
166 if (! -e $xslt_filename) {
167 # Look in collection directory
168 my $coldir = $ENV{'GSDLCOLLECTDIR'};
169 $xslt_filename = &util::filename_cat($coldir,$xslt);
170 }
171
172 if (! -e $xslt_filename) {
173 print $outhandle "Warning: Unable to find XSLT $xslt\n";
174 if (open(XMLIN,"<$filename")) {
175
176 my $untransformed_xml = "";
177 while (defined (my $line = <XMLIN>)) {
178
179 $untransformed_xml .= $line;
180 }
181 close(XMLIN);
182
183 return $untransformed_xml;
184 }
185 else {
186 print $outhandle "Error: Unable to open file $filename\n";
187 print $outhandle " $!\n";
188 return "";
189 }
190
191 }
192
193 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
194 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
195 my $xslt_base_cmd = "java -jar $jar_filename";
196 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
197
198 my $transformed_xml = "";
199
200 if (open(XSLT_IN,"$xslt_cmd |")) {
201 while (defined (my $line = <XSLT_IN>)) {
202
203 $transformed_xml .= $line;
204 }
205 close(XSLT_IN);
206 }
207 else {
208 print $outhandle "Error: Unable to run command $xslt_cmd\n";
209 print $outhandle " $!\n";
210 }
211
212 return $transformed_xml;
213
214}
215
216
217
218sub rdf_desc_to_id
219{
220 my $self = shift (@_);
221 my ($rdf_desc) = @_;
222
223 my $rdf_id = {};
224
225 # initialise any .cpd (=complex multi page) structures
226
227 foreach my $about_key (keys %{$rdf_desc}) {
228 if ($about_key =~ m/\.cpd$/) {
229 my $about = $rdf_desc->{$about_key};
230 my $id = $about->{'dc:identifier'};
231
232 if ($id =~ m/^\s*$/) {
233 # missing id, make one up based on about attribute
234
235 my ($tailname, $dirname, $suffix)
236 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
237
238 $id = "about:$tailname";
239 }
240
241 $rdf_id->{$id} = $about;
242 $rdf_id->{$id}->{'ex:filename'} = $about_key;
243 $rdf_id->{$id}->{'ex:type'} = "complex";
244 $rdf_id->{$id}->{'pages'} = [];
245 }
246
247 }
248
249 # now add in *non* .cpd items
250
251 foreach my $about_key (keys %{$rdf_desc}) {
252 if ($about_key !~ m/\.cpd$/) {
253 my $about = $rdf_desc->{$about_key};
254 my $id = $about->{'dc:identifier'};
255
256
257 if ($id =~ m/^\s*$/) {
258 # missing id, make one up based on about attribute
259
260 my ($tailname, $dirname, $suffix)
261 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
262
263 $id = "about:$tailname";
264 }
265
266 if (defined $rdf_id->{$id}) {
267 $about->{'ex:filename'} = $about_key;
268
269 # dealing with complex multi-page situation
270 # Add to existing structure
271
272 my $pages = $rdf_id->{$id}->{'pages'};
273 push(@$pages,$about)
274 }
275 else {
276 # New entry
277
278 $rdf_id->{$id} = $about;
279 $rdf_id->{$id}->{'ex:type'} = "simple";
280 $rdf_id->{$id}->{'ex:filename'} = $about_key;
281 }
282 }
283
284 }
285
286 return $rdf_id;
287}
288
289
290sub metadata_table_txt_file
291{
292 my $self = shift (@_);
293 my ($output_root,$page_num) = @_;
294
295 my $txt_filename = $output_root."_page_$page_num.txt";
296
297 my ($tailname, $dirname, $suffix)
298 = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
299
300 my $txt_file = "$tailname.$suffix";
301
302 return $txt_file;
303}
304
305
306sub output_metadata_table
307{
308 my $self = shift (@_);
309 my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
310
311 my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
312
313 open(TOUT,">$txt_filename")
314 || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
315
316 print TOUT $page->{'MetadataTable'};
317 delete $page->{'MetadataTable'};
318
319 close (TOUT);
320}
321
322
323sub rdf_id_to_item_file
324{
325 my $self = shift (@_);
326 my ($rdf_id,$tmp_dirname,$output_root) = @_;
327
328 my $item_file_list = [];
329
330 foreach my $id (keys %{$rdf_id}) {
331
332 my $id_safe = $id;
333 $id_safe =~ s/ /-/g;
334
335 my $output_filename = $output_root."_$id_safe.item";
336 open(FOUT,">$output_filename")
337 || die "Unable to open $output_filename: $!\n";
338
339
340 print FOUT "<PagedDocument>\n";
341
342 my $rdf_doc = $rdf_id->{$id};
343 foreach my $metadata_name (keys %$rdf_doc) {
344
345
346 next if ($metadata_name eq "pages");
347
348 my $metadata_value = $rdf_doc->{$metadata_name};
349
350 # convert ns:name to ns.Name
351 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
352
353 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
354 }
355
356 if ($rdf_doc->{'ex:type'} eq "complex") {
357 my $pages = $rdf_doc->{'pages'};
358 my $page_num = 1;
359
360 foreach my $page (@$pages) {
361
362 my $imgfile = $page->{'ex:filename'};
363 if ($imgfile =~ m/(http|ftp):/) {
364 $imgfile = "empty.jpg";
365 }
366 else {
367 $imgfile = &util::filename_cat("..","import",$imgfile);
368 }
369
370 my $txt_file
371 = $self->metadata_table_txt_file($output_root,$page_num);
372
373 $self->output_metadata_table($page,$page_num,
374 $tmp_dirname,$txt_file);
375
376
377 print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
378
379 foreach my $metadata_name (keys %$page) {
380
381 my $metadata_value = $rdf_doc->{$metadata_name};
382 # convert ns:name to ns.Name
383 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
384
385 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
386 }
387
388
389 $page_num++;
390
391
392 print FOUT " </Page>\n";
393 }
394 }
395 else {
396 # simple
397 # duplicate top-level metadata for now plus image to bind to
398
399 my $imgfile = $rdf_doc->{'ex:filename'};
400 if ($imgfile =~ m/(http|ftp):/) {
401 $imgfile = "empty.jpg";
402 }
403 else {
404 $imgfile = &util::filename_cat("..","import",$imgfile);
405 $imgfile = "../import/".$imgfile;
406 }
407
408
409 my $txt_file = $self->metadata_table_txt_file($output_root,1);
410 $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
411
412 print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
413 foreach my $metadata_name (keys %$rdf_doc) {
414
415 my $metadata_value = $rdf_doc->{$metadata_name};
416
417 # convert ns:name to ns.Name
418 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
419
420 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
421 }
422 print FOUT " </Page>\n";
423
424 }
425
426 print FOUT "</PagedDocument>\n";
427 close(FOUT);
428
429 push(@$item_file_list,$output_filename);
430
431 }
432
433
434 return $item_file_list;
435}
436
437
438
439sub xml_area_convert_file
440{
441 my $self = shift (@_);
442 my ($input_filename, $tmp_dirname, $output_root) = @_;
443
444 eval {
445 # Build up hash table/tree of all records
446
447 my $xslt = $self->{'xslt'};
448 if (defined $xslt && ($xslt ne "")) {
449 # perform xslt
450 my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
451
452 # feed transformed file (now in memory as string) into XML parser
453 $self->{'parser'}->parse($transformed_xml);
454 }
455 else {
456 $self->{'parser'}->parsefile($input_filename);
457 }
458 };
459
460 if ($@) {
461
462 # parsefile may either croak somewhere in XML::Parser (e.g. because
463 # the document is not well formed) or die somewhere in XMLPlug or a
464 # derived plugin (e.g. because we're attempting to process a
465 # document whose DOCTYPE is not meant for this plugin). For the
466 # first case we'll print a warning and continue, for the second
467 # we'll just continue quietly
468
469 print STDERR "**** Error is: $@\n";
470
471 my $file = $self->{'file'};
472
473 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
474 if (defined $msg) {
475 my $outhandle = $self->{'outhandle'};
476 my $plugin_name = ref ($self);
477 print $outhandle "$plugin_name failed to process $file ($msg)\n";
478 }
479
480 my $gli = $self->{'gli'};
481
482 # reset ourself for the next document
483 $self->{'section_level'}=0;
484 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
485 return ("fail",undef); # error during processing
486 }
487
488 my $rdf_desc = $self->{'rdf_desc'};
489
490# foreach my $about_key (keys %{$rdf_desc}) {
491# my $about = $rdf_desc->{$about_key};
492# foreach my $metadata_name (keys %{$about}) {
493#
494# my $metadata_value = $about->{$metadata_name};
495## print STDERR " $metadata_name: $metadata_value\n";
496# }
497# }
498
499
500 # Merge entries with same name
501
502
503 my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
504
505# foreach my $about_key (keys %{$merged_rdf_id}) {
506# my $about = $merged_rdf_id->{$about_key};
507# foreach my $metadata_name (keys %{$about}) {
508#
509# my $metadata_value = $about->{$metadata_name};
510## print STDERR " $metadata_name: $metadata_value\n";
511# }
512# }
513
514
515
516 my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
517 $output_root);
518
519 return ("item",$item_files);
520}
521
522
523# Override ConvertToPlug tmp_area_convert_file() to provide solution specific
524# to CONTENTdm
525#
526# A better (i.e. in the future) solution would be to see if this can be
527# shifted into gsConvert.pl so there is no need to override the
528# default tmp_area_convert_file()
529
530
531sub tmp_area_convert_file {
532 my $self = shift (@_);
533 my ($output_ext, $input_filename, $textref) = @_;
534
535 # is textref ever used?!?
536
537 my $outhandle = $self->{'outhandle'};
538 my $convert_to = $self->{'convert_to'};
539 my $failhandle = $self->{'failhandle'};
540 my $convert_to_ext = $self->{'convert_to_ext'};
541
542 # softlink to collection tmp dir
543 my $tmp_dirname
544 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
545 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
546
547 # derive tmp filename from input filename
548 my ($tailname, $dirname, $suffix)
549 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
550
551 # Remove any white space from filename -- no risk of name collision, and
552 # makes later conversion by utils simpler. Leave spaces in path...
553 # tidy up the filename with space, dot, hyphen between
554 $tailname =~ s/\s+//g;
555 $tailname =~ s/\.+//g;
556 $tailname =~ s/\-+//g;
557 $suffix = lc($suffix);
558 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
559
560 &util::soft_link($input_filename, $tmp_filename);
561 my $verbosity = $self->{'verbosity'};
562 if ($verbosity > 0) {
563 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
564 }
565
566 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
567
568 # call xml_area_convert_file rather than gsConvert.pl
569
570 my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
571
572 my ($output_type,$item_files)
573 = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
574
575
576 my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
577 my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
578
579 print STDERR "***** Setting up fake filename $fakeimg_filename -> $fakeimg_tmp_filename\n";
580
581 &util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
582
583 # continue as before ...
584
585 # remove symbolic link to original file
586 &util::rm($tmp_filename);
587
588 # Check STDERR here
589 chomp $output_type;
590 if ($output_type eq "fail") {
591 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
592 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
593 $self->{'num_not_processed'} ++;
594 if (-s "$errlog") {
595 open(ERRLOG, "$errlog");
596 while (<ERRLOG>) {
597 print $outhandle "$_";
598 }
599 print $outhandle "\n";
600 close ERRLOG;
601 }
602 &util::rm("$errlog") if (-e "$errlog");
603 return [];
604 }
605
606 # store the *actual* output type and return the output filename
607 # it's possible we requested conversion to html, but only to text succeeded
608 #$self->{'convert_to_ext'} = $output_type;
609 if ($output_type =~ /html/i) {
610 $self->{'converted_to'} = "HTML";
611 } elsif ($output_type =~ /te?xt/i) {
612 $self->{'converted_to'} = "TEXT";
613 } elsif ($output_type =~ /item/i){
614 $self->{'converted_to'} = "PagedImg";
615 }
616
617
618 return $item_files;
619}
620
621
622
623
624# Override ConvertToPlug read
625# Needed so multiple .item files generate are sent down secondary plugin
626
627sub read {
628 my $self = shift (@_);
629 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
630
631 $self->{'gli'} = $gli;
632 $self->{'file'} = $file;
633
634 my $successful_rv = -1;
635
636 my $outhandle = $self->{'outhandle'};
637
638 my ($block_status,$filename) = $self->read_block(@_);
639 return $block_status if ((!defined $block_status) || ($block_status==0));
640 $file = $self->read_tidy_file($file);
641
642 # read() deviates at this point from ConvertToPlug
643 # Need to work with list of filename returned
644
645 my $output_ext = $self->{'convert_to_ext'};
646 my $conv_filename_list = [];
647
648 $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename);
649
650 if (scalar(@$conv_filename_list)==0) {
651 return -1;
652 } # had an error, will be passed down pipeline
653
654 foreach my $conv_filename ( @$conv_filename_list ) {
655 if (! -e "$conv_filename") {return -1;}
656 $self->{'conv_filename'} = $conv_filename; # is this used anywhere?
657 $self->convert_post_process($conv_filename);
658
659 my $secondary_plugins = $self->{'secondary_plugins'};
660 my $num_secondary_plugins = scalar(keys %$secondary_plugins);
661
662 if ($num_secondary_plugins == 0) {
663 print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
664 return 0; # effectively block it
665 }
666
667 my @plugin_names = keys %$secondary_plugins;
668 my $plugin_name = shift @plugin_names;
669
670 if ($num_secondary_plugins > 1) {
671 print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
672 }
673
674 my $secondary_plugin = $secondary_plugins->{$plugin_name};
675
676 # note: metadata is not carried on to the next level
677 my ($rv,$doc_obj)
678 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
679 $metadata, $processor, $maxdocs, $total_count,
680 $gli);
681
682 if ((defined $rv) && ($rv>=0)) {
683 $successful_rv = 1;
684 }
685
686 # Override previous gsdlsourcefilename set by secondary plugin
687 my $collect_file = &util::filename_within_collection($filename);
688 my $collect_conv_file = &util::filename_within_collection($conv_filename);
689 $doc_obj->set_source_filename ($collect_file);
690 $doc_obj->set_converted_filename($collect_conv_file);
691
692 my ($filemeta) = $file =~ /([^\\\/]+)$/;
693 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
694 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
695 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));
696
697 if ($self->{'cover_image'}) {
698 $self->associate_cover_image($doc_obj, $filename);
699 }
700
701 # do plugin specific processing of doc_obj
702 unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
703 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
704 return -1;
705 }
706 # do any automatic metadata extraction
707 $self->auto_extract_metadata ($doc_obj);
708
709 # have we found a Title??
710 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
711
712 # add an OID
713 $doc_obj->set_OID();
714 # process the document
715 $processor->process($doc_obj);
716
717 $self->{'num_processed'} ++;
718 }
719
720 return $successful_rv;
721}
722
723
724
725sub process {
726 my $self = shift (@_);
727 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
728
729
730 my $secondary_plugins = $self->{'secondary_plugins'};
731 my @plugin_names = keys %$secondary_plugins;
732 my $plugin_name = shift @plugin_names; # already checked there is only one
733
734 my $secondary_plugin = $secondary_plugins->{$plugin_name};
735
736 my $result = $secondary_plugin->process(@_);
737
738 return $result;
739}
740
741
742
743
744sub StartDocument {$self->xml_start_document(@_);}
745sub XMLDecl {$self->xml_xmldecl(@_);}
746sub Entity {$self->xml_entity(@_);}
747sub Doctype {$self->xml_doctype(@_);}
748sub StartTag {$self->xml_start_tag(@_);}
749sub EndTag {$self->xml_end_tag(@_);}
750sub Text {$self->xml_text(@_);}
751sub PI {$self->xml_pi(@_);}
752sub EndDocument {$self->xml_end_document(@_);}
753sub Default {$self->xml_default(@_);}
754
755# This Char function overrides the one in XML::Parser::Stream to overcome a
756# problem where $expat->{Text} is treated as the return value, slowing
757# things down significantly in some cases.
758sub Char {
759 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
760 $_[0]->{'Text'} .= $_[1];
761 return undef;
762}
763
764# Called at the beginning of the XML document.
765sub xml_start_document {
766 my $self = shift(@_);
767 my ($expat) = @_;
768
769 $self->{'rdf_desc'} = {};
770}
771
772# Called for XML declarations
773sub xml_xmldecl {
774 my $self = shift(@_);
775 my ($expat, $version, $encoding, $standalone) = @_;
776}
777
778# Called for XML entities
779sub xml_entity {
780 my $self = shift(@_);
781 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
782}
783
784# Called for DOCTYPE declarations - use die to bail out if this doctype
785# is not meant for this plugin
786sub xml_doctype {
787 my $self = shift(@_);
788 my ($expat, $name, $sysid, $pubid, $internal) = @_;
789
790 die "" if ($name !~ /^rdf:RDF$/);
791
792 my $outhandle = $self->{'outhandle'};
793 print $outhandle "CONTENTdmPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
794
795}
796
797# Called for every start tag. The $_ variable will contain a copy of the
798# tag and the %_ variable will contain the element's attributes.
799sub xml_start_tag {
800 my $self = shift(@_);
801 my ($expat, $element) = @_;
802
803 if ($element eq "rdf:Description") {
804
805 my $about_key = $_{'about'};
806
807 my $rdf_desc = $self->{'rdf_desc'};
808 $rdf_desc->{$about_key} = {};
809
810 $self->{'about_key'} = $about_key;
811 $self->{'index_text'} = "";
812 $self->{'pp_text'} = "<table width=\"100%\">\n";
813
814
815 }
816 elsif (defined $self->{'about_key'}) {
817 $self->{'metadata_name'} = $element;
818 $self->{'metadata_value'} = "";
819 }
820
821}
822
823# Called for every end tag. The $_ variable will contain a copy of the tag.
824sub xml_end_tag {
825 my $self = shift(@_);
826 my ($expat, $element) = @_;
827
828 if ($element eq "rdf:Description") {
829 $self->{'pp_text'} .= "</table>\n";
830 ## ghtml::htmlsafe($self->{'pp_text'});
831
832
833 my $about_key = $self->{'about_key'};
834 my $about = $self->{'rdf_desc'}->{$about_key};
835 $about->{'IndexText'} = $self->{'index_text'};
836 $about->{'MetadataTable'} = $self->{'pp_text'};
837
838
839 $self->{'about_key'} = undef;
840 $self->{'index_text'} = undef;
841 $self->{'pp_text'} = undef;
842
843 }
844 elsif (defined $self->{'metadata_name'}) {
845 my $metadata_name = $self->{'metadata_name'};
846 if ($element eq $metadata_name) {
847 my $metadata_value = $self->{'metadata_value'};
848
849 my $about_key = $self->{'about_key'};
850 my $about = $self->{'rdf_desc'}->{$about_key};
851 $about->{$metadata_name} = $metadata_value;
852
853 $self->{'index_text'} .= "$metadata_value\n";
854 $self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
855
856 $self->{'metadata_name'} = undef;
857 $self->{'metadata_value'} = undef;
858 }
859 }
860}
861
862# Called just before start or end tags with accumulated non-markup text in
863# the $_ variable.
864sub xml_text {
865 my $self = shift(@_);
866 my ($expat) = @_;
867
868 if (defined $self->{'metadata_name'}) {
869 $self->{'metadata_value'} .= $_;
870 }
871}
872
873# Called for processing instructions. The $_ variable will contain a copy
874# of the pi.
875sub xml_pi {
876 my $self = shift(@_);
877 my ($expat, $target, $data) = @_;
878}
879
880# Called at the end of the XML document.
881sub xml_end_document {
882 my $self = shift(@_);
883 my ($expat) = @_;
884
885}
886
887# Called for any characters not handled by the above functions.
888sub xml_default {
889 my $self = shift(@_);
890 my ($expat, $text) = @_;
891}
892
893
8941;
Note: See TracBrowser for help on using the repository browser.