source: gsdl/trunk/perllib/plugins/CONTENTdmPlug.pm@ 15114

Last change on this file since 15114 was 15114, checked in by ak19, 16 years ago

Added srcreplaceable option to work with new script replace_srcdoc_with_html.pl

  • Property svn:keywords set to Author Date Id Revision
File size: 23.6 KB
Line 
1###########################################################################
2#
3# CONTENTdmPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package CONTENTdmPlug;
26
27use ConvertToPlug;
28use unicode;
29use ghtml;
30
31use strict;
32no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
33
34
35use XMLParser;
36
37sub BEGIN {
38 @CONTENTdmPlug::ISA = ('ConvertToPlug');
39}
40
41
42my $convert_to_list =
43 [ { 'name' => "auto",
44 'desc' => "{ConvertToPlug.convert_to.auto}" },
45 { 'name' => "html",
46 'desc' => "{ConvertToPlug.convert_to.html}" },
47 { 'name' => "text",
48 'desc' => "{ConvertToPlug.convert_to.text}" },
49 { 'name' => "pagedimg",
50 'desc' => "{ConvertToPlug.convert_to.pagedimg}"},
51 ];
52
53
54
55my $arguments =
56 [
57 { 'name' => "convert_to",
58 'desc' => "{ConvertToPlug.convert_to}",
59 'type' => "enum",
60 'reqd' => "yes",
61 'list' => $convert_to_list,
62 'deft' => "html" },
63 { 'name' => "xslt",
64 'desc' => "{XMLPlug.xslt}",
65 'type' => "string",
66 'deft' => "",
67 'reqd' => "no" },
68 { 'name' => "process_exp",
69 'desc' => "{BasPlug.process_exp}",
70 'type' => "regexp",
71 'deft' => &get_default_process_exp(),
72 'reqd' => "no" },
73 { 'name' => "block_exp",
74 'desc' => "{BasPlug.block_exp}",
75 'type' => "regexp",
76 'deft' => &get_default_block_exp() }
77];
78
79my $options = { 'name' => "CONTENTdmPlug",
80 'desc' => "{CONTENTdmPlug.desc}",
81 'abstract' => "no",
82 'inherits' => "yes",
83 # CONTENTdmPlug is one of the few ConvertToPlug subclasses whose source doc can't be replaced by a GS-generated html
84 'srcreplaceable' => "no",
85 'args' => $arguments };
86
87our ($self);
88
89sub new {
90 my ($class) = shift (@_);
91 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
92 push(@$pluginlist, $class);
93
94 push(@$inputargs,"-title_sub");
95 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
96
97 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
98 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
99
100 my @arg_array = @$inputargs;
101 $self = (defined $hashArgOptLists)? new ConvertToPlug($pluginlist,$inputargs,$hashArgOptLists): new ConvertToPlug($pluginlist,$inputargs);
102
103 if ($self->{'info_only'}) {
104 # don't worry about any options etc
105 return bless $self, $class;
106 }
107
108 my $parser = new XML::Parser('Style' => 'Stream',
109 'Handlers' => {'Char' => \&Char,
110 'XMLDecl' => \&XMLDecl,
111 'Entity' => \&Entity,
112 'Doctype' => \&Doctype,
113 'Default' => \&Default
114 });
115 $self->{'parser'} = $parser;
116 $self->{'rdf_desc'} = undef;
117 $self->{'about_key'} = undef;
118 $self->{'metadata_name'} = undef;
119 $self->{'metadata_value'} = undef;
120
121 $self->{'convert_to'} = "PagedImg";
122 my $secondary_plugin_options = $self->{'secondary_plugin_options'};
123
124 if (!defined $secondary_plugin_options->{'PagedImgPlug'}){
125 $secondary_plugin_options->{'PagedImgPlug'} = [];
126 my $pagedimg_options = $secondary_plugin_options->{'PagedImgPlug'};
127 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
128 push(@$pagedimg_options, "-thumbnail", "-screenview");
129 }
130
131 $self = bless $self, $class;
132 $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
133 return $self;
134}
135
136sub get_default_process_exp {
137 my $self = shift (@_);
138
139 return q^(?i)\.rdf$^;
140}
141
142# so we don't inherit HTMLPlug's block exp...
143sub get_default_block_exp {
144 return q^(?i)\.(jpg|jpeg|gif)$^;
145}
146
147
148
149
150# A smarter (?) option would be to add XMLPlug into inheritence above
151# thereby avoiding a fair amount of code duplication
152
153sub apply_xslt
154{
155 my $self = shift @_;
156 my ($xslt,$filename) = @_;
157
158 my $outhandle = $self->{'outhandle'};
159
160 my $xslt_filename = $xslt;
161
162 if (! -e $xslt_filename) {
163 # Look in main site directory
164 my $gsdlhome = $ENV{'GSDLHOME'};
165 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
166 }
167
168 if (! -e $xslt_filename) {
169 # Look in collection directory
170 my $coldir = $ENV{'GSDLCOLLECTDIR'};
171 $xslt_filename = &util::filename_cat($coldir,$xslt);
172 }
173
174 if (! -e $xslt_filename) {
175 print $outhandle "Warning: Unable to find XSLT $xslt\n";
176 if (open(XMLIN,"<$filename")) {
177
178 my $untransformed_xml = "";
179 while (defined (my $line = <XMLIN>)) {
180
181 $untransformed_xml .= $line;
182 }
183 close(XMLIN);
184
185 return $untransformed_xml;
186 }
187 else {
188 print $outhandle "Error: Unable to open file $filename\n";
189 print $outhandle " $!\n";
190 return "";
191 }
192
193 }
194
195 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
196 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
197 my $xslt_base_cmd = "java -jar $jar_filename";
198 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
199
200 my $transformed_xml = "";
201
202 if (open(XSLT_IN,"$xslt_cmd |")) {
203 while (defined (my $line = <XSLT_IN>)) {
204
205 $transformed_xml .= $line;
206 }
207 close(XSLT_IN);
208 }
209 else {
210 print $outhandle "Error: Unable to run command $xslt_cmd\n";
211 print $outhandle " $!\n";
212 }
213
214 return $transformed_xml;
215
216}
217
218
219
220sub rdf_desc_to_id
221{
222 my $self = shift (@_);
223 my ($rdf_desc) = @_;
224
225 my $rdf_id = {};
226
227 # initialise any .cpd (=complex multi page) structures
228
229 foreach my $about_key (keys %{$rdf_desc}) {
230 if ($about_key =~ m/\.cpd$/) {
231 my $about = $rdf_desc->{$about_key};
232 my $id = $about->{'dc:identifier'};
233
234 if ($id =~ m/^\s*$/) {
235 # missing id, make one up based on about attribute
236
237 my ($tailname, $dirname, $suffix)
238 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
239
240 $id = "about:$tailname";
241 }
242
243 $rdf_id->{$id} = $about;
244 $rdf_id->{$id}->{'ex:filename'} = $about_key;
245 $rdf_id->{$id}->{'ex:type'} = "complex";
246 $rdf_id->{$id}->{'pages'} = [];
247 }
248
249 }
250
251 # now add in *non* .cpd items
252
253 foreach my $about_key (keys %{$rdf_desc}) {
254 if ($about_key !~ m/\.cpd$/) {
255 my $about = $rdf_desc->{$about_key};
256 my $id = $about->{'dc:identifier'};
257
258
259 if ($id =~ m/^\s*$/) {
260 # missing id, make one up based on about attribute
261
262 my ($tailname, $dirname, $suffix)
263 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
264
265 $id = "about:$tailname";
266 }
267
268 if (defined $rdf_id->{$id}) {
269 $about->{'ex:filename'} = $about_key;
270
271 # dealing with complex multi-page situation
272 # Add to existing structure
273
274 my $pages = $rdf_id->{$id}->{'pages'};
275 push(@$pages,$about)
276 }
277 else {
278 # New entry
279
280 $rdf_id->{$id} = $about;
281 $rdf_id->{$id}->{'ex:type'} = "simple";
282 $rdf_id->{$id}->{'ex:filename'} = $about_key;
283 }
284 }
285
286 }
287
288 return $rdf_id;
289}
290
291
292sub metadata_table_txt_file
293{
294 my $self = shift (@_);
295 my ($output_root,$page_num) = @_;
296
297 my $txt_filename = $output_root."_page_$page_num.txt";
298
299 my ($tailname, $dirname, $suffix)
300 = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
301
302 my $txt_file = "$tailname.$suffix";
303
304 return $txt_file;
305}
306
307
308sub output_metadata_table
309{
310 my $self = shift (@_);
311 my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
312
313 my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
314
315 open(TOUT,">$txt_filename")
316 || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
317
318 print TOUT $page->{'MetadataTable'};
319 delete $page->{'MetadataTable'};
320
321 close (TOUT);
322}
323
324
325sub rdf_id_to_item_file
326{
327 my $self = shift (@_);
328 my ($rdf_id,$tmp_dirname,$output_root) = @_;
329
330 my $item_file_list = [];
331
332 foreach my $id (keys %{$rdf_id}) {
333
334 my $id_safe = $id;
335 $id_safe =~ s/ /-/g;
336
337 my $output_filename = $output_root."_$id_safe.item";
338 open(FOUT,">$output_filename")
339 || die "Unable to open $output_filename: $!\n";
340
341
342 print FOUT "<PagedDocument>\n";
343
344 my $rdf_doc = $rdf_id->{$id};
345 foreach my $metadata_name (keys %$rdf_doc) {
346
347
348 next if ($metadata_name eq "pages");
349
350 my $metadata_value = $rdf_doc->{$metadata_name};
351
352 # convert ns:name to ns.Name
353 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
354
355 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
356 }
357
358 if ($rdf_doc->{'ex:type'} eq "complex") {
359 my $pages = $rdf_doc->{'pages'};
360 my $page_num = 1;
361
362 foreach my $page (@$pages) {
363
364 my $imgfile = $page->{'ex:filename'};
365 if ($imgfile =~ m/(http|ftp):/) {
366 $imgfile = "empty.jpg";
367 }
368 else {
369 $imgfile = &util::filename_cat("..","import",$imgfile);
370 }
371
372 my $txt_file
373 = $self->metadata_table_txt_file($output_root,$page_num);
374
375 $self->output_metadata_table($page,$page_num,
376 $tmp_dirname,$txt_file);
377
378
379 print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
380
381 foreach my $metadata_name (keys %$page) {
382
383 my $metadata_value = $rdf_doc->{$metadata_name};
384 # convert ns:name to ns.Name
385 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
386
387 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
388 }
389
390
391 $page_num++;
392
393
394 print FOUT " </Page>\n";
395 }
396 }
397 else {
398 # simple
399 # duplicate top-level metadata for now plus image to bind to
400
401 my $imgfile = $rdf_doc->{'ex:filename'};
402 if ($imgfile =~ m/(http|ftp):/) {
403 $imgfile = "empty.jpg";
404 }
405 else {
406 $imgfile = &util::filename_cat("..","import",$imgfile);
407 $imgfile = "../import/".$imgfile;
408 }
409
410
411 my $txt_file = $self->metadata_table_txt_file($output_root,1);
412 $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
413
414 print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
415 foreach my $metadata_name (keys %$rdf_doc) {
416
417 my $metadata_value = $rdf_doc->{$metadata_name};
418
419 # convert ns:name to ns.Name
420 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
421
422 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
423 }
424 print FOUT " </Page>\n";
425
426 }
427
428 print FOUT "</PagedDocument>\n";
429 close(FOUT);
430
431 push(@$item_file_list,$output_filename);
432
433 }
434
435
436 return $item_file_list;
437}
438
439
440
441sub xml_area_convert_file
442{
443 my $self = shift (@_);
444 my ($input_filename, $tmp_dirname, $output_root) = @_;
445
446 eval {
447 # Build up hash table/tree of all records
448
449 my $xslt = $self->{'xslt'};
450 if (defined $xslt && ($xslt ne "")) {
451 # perform xslt
452 my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
453
454 # feed transformed file (now in memory as string) into XML parser
455 $self->{'parser'}->parse($transformed_xml);
456 }
457 else {
458 $self->{'parser'}->parsefile($input_filename);
459 }
460 };
461
462 if ($@) {
463
464 # parsefile may either croak somewhere in XML::Parser (e.g. because
465 # the document is not well formed) or die somewhere in XMLPlug or a
466 # derived plugin (e.g. because we're attempting to process a
467 # document whose DOCTYPE is not meant for this plugin). For the
468 # first case we'll print a warning and continue, for the second
469 # we'll just continue quietly
470
471 print STDERR "**** Error is: $@\n";
472
473 my $file = $self->{'file'};
474
475 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
476 if (defined $msg) {
477 my $outhandle = $self->{'outhandle'};
478 my $plugin_name = ref ($self);
479 print $outhandle "$plugin_name failed to process $file ($msg)\n";
480 }
481
482 my $gli = $self->{'gli'};
483
484 # reset ourself for the next document
485 $self->{'section_level'}=0;
486 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
487 return ("fail",undef); # error during processing
488 }
489
490 my $rdf_desc = $self->{'rdf_desc'};
491
492# foreach my $about_key (keys %{$rdf_desc}) {
493# my $about = $rdf_desc->{$about_key};
494# foreach my $metadata_name (keys %{$about}) {
495#
496# my $metadata_value = $about->{$metadata_name};
497## print STDERR " $metadata_name: $metadata_value\n";
498# }
499# }
500
501
502 # Merge entries with same name
503
504
505 my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
506
507# foreach my $about_key (keys %{$merged_rdf_id}) {
508# my $about = $merged_rdf_id->{$about_key};
509# foreach my $metadata_name (keys %{$about}) {
510#
511# my $metadata_value = $about->{$metadata_name};
512## print STDERR " $metadata_name: $metadata_value\n";
513# }
514# }
515
516
517
518 my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
519 $output_root);
520
521 return ("item",$item_files);
522}
523
524
525# Override ConvertToPlug tmp_area_convert_file() to provide solution specific
526# to CONTENTdm
527#
528# A better (i.e. in the future) solution would be to see if this can be
529# shifted into gsConvert.pl so there is no need to override the
530# default tmp_area_convert_file()
531
532
533sub tmp_area_convert_file {
534 my $self = shift (@_);
535 my ($output_ext, $input_filename, $textref) = @_;
536
537 # is textref ever used?!?
538
539 my $outhandle = $self->{'outhandle'};
540 my $convert_to = $self->{'convert_to'};
541 my $failhandle = $self->{'failhandle'};
542 my $convert_to_ext = $self->{'convert_to_ext'};
543
544 # softlink to collection tmp dir
545 my $tmp_dirname
546 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
547 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
548
549 # derive tmp filename from input filename
550 my ($tailname, $dirname, $suffix)
551 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
552
553 # Remove any white space from filename -- no risk of name collision, and
554 # makes later conversion by utils simpler. Leave spaces in path...
555 # tidy up the filename with space, dot, hyphen between
556 $tailname =~ s/\s+//g;
557 $tailname =~ s/\.+//g;
558 $tailname =~ s/\-+//g;
559 $suffix = lc($suffix);
560 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
561
562 &util::soft_link($input_filename, $tmp_filename);
563 my $verbosity = $self->{'verbosity'};
564 if ($verbosity > 0) {
565 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
566 }
567
568 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
569
570 # call xml_area_convert_file rather than gsConvert.pl
571
572 my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
573
574 my ($output_type,$item_files)
575 = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
576
577
578 my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
579 my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
580
581 print STDERR "***** Setting up fake filename $fakeimg_filename -> $fakeimg_tmp_filename\n";
582
583 &util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
584
585 # continue as before ...
586
587 # remove symbolic link to original file
588 &util::rm($tmp_filename);
589
590 # Check STDERR here
591 chomp $output_type;
592 if ($output_type eq "fail") {
593 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
594 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
595 $self->{'num_not_processed'} ++;
596 if (-s "$errlog") {
597 open(ERRLOG, "$errlog");
598 while (<ERRLOG>) {
599 print $outhandle "$_";
600 }
601 print $outhandle "\n";
602 close ERRLOG;
603 }
604 &util::rm("$errlog") if (-e "$errlog");
605 return [];
606 }
607
608 # store the *actual* output type and return the output filename
609 # it's possible we requested conversion to html, but only to text succeeded
610 #$self->{'convert_to_ext'} = $output_type;
611 if ($output_type =~ /html/i) {
612 $self->{'converted_to'} = "HTML";
613 } elsif ($output_type =~ /te?xt/i) {
614 $self->{'converted_to'} = "TEXT";
615 } elsif ($output_type =~ /item/i){
616 $self->{'converted_to'} = "PagedImg";
617 }
618
619
620 return $item_files;
621}
622
623
624
625
626# Override ConvertToPlug read
627# Needed so multiple .item files generate are sent down secondary plugin
628
629sub read {
630 my $self = shift (@_);
631 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
632
633 $self->{'gli'} = $gli;
634 $self->{'file'} = $file;
635
636 my $successful_rv = -1;
637
638 my $outhandle = $self->{'outhandle'};
639
640 my ($block_status,$filename) = $self->read_block(@_);
641 return $block_status if ((!defined $block_status) || ($block_status==0));
642 $file = $self->read_tidy_file($file);
643
644 # read() deviates at this point from ConvertToPlug
645 # Need to work with list of filename returned
646
647 my $output_ext = $self->{'convert_to_ext'};
648 my $conv_filename_list = [];
649
650 $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename);
651
652 if (scalar(@$conv_filename_list)==0) {
653 return -1;
654 } # had an error, will be passed down pipeline
655
656 foreach my $conv_filename ( @$conv_filename_list ) {
657 if (! -e "$conv_filename") {return -1;}
658 $self->{'conv_filename'} = $conv_filename; # is this used anywhere?
659 $self->convert_post_process($conv_filename);
660
661 my $secondary_plugins = $self->{'secondary_plugins'};
662 my $num_secondary_plugins = scalar(keys %$secondary_plugins);
663
664 if ($num_secondary_plugins == 0) {
665 print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
666 return 0; # effectively block it
667 }
668
669 my @plugin_names = keys %$secondary_plugins;
670 my $plugin_name = shift @plugin_names;
671
672 if ($num_secondary_plugins > 1) {
673 print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
674 }
675
676 my $secondary_plugin = $secondary_plugins->{$plugin_name};
677
678 # note: metadata is not carried on to the next level
679 my ($rv,$doc_obj)
680 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
681 $metadata, $processor, $maxdocs, $total_count,
682 $gli);
683
684 if ((defined $rv) && ($rv>=0)) {
685 $successful_rv = 1;
686 }
687
688 # Override previous gsdlsourcefilename set by secondary plugin
689 my $collect_file = &util::filename_within_collection($filename);
690 my $collect_conv_file = &util::filename_within_collection($conv_filename);
691 $doc_obj->set_source_filename ($collect_file);
692 $doc_obj->set_converted_filename($collect_conv_file);
693
694 my ($filemeta) = $file =~ /([^\\\/]+)$/;
695 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
696 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
697 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));
698
699 if ($self->{'cover_image'}) {
700 $self->associate_cover_image($doc_obj, $filename);
701 }
702
703 # do plugin specific processing of doc_obj
704 unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
705 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
706 return -1;
707 }
708 # do any automatic metadata extraction
709 $self->auto_extract_metadata ($doc_obj);
710
711 # have we found a Title??
712 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
713
714 # add an OID
715 $doc_obj->set_OID();
716 # process the document
717 $processor->process($doc_obj);
718
719 $self->{'num_processed'} ++;
720 }
721
722 return $successful_rv;
723}
724
725
726
727sub process {
728 my $self = shift (@_);
729 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
730
731
732 my $secondary_plugins = $self->{'secondary_plugins'};
733 my @plugin_names = keys %$secondary_plugins;
734 my $plugin_name = shift @plugin_names; # already checked there is only one
735
736 my $secondary_plugin = $secondary_plugins->{$plugin_name};
737
738 my $result = $secondary_plugin->process(@_);
739
740 return $result;
741}
742
743
744
745
746sub StartDocument {$self->xml_start_document(@_);}
747sub XMLDecl {$self->xml_xmldecl(@_);}
748sub Entity {$self->xml_entity(@_);}
749sub Doctype {$self->xml_doctype(@_);}
750sub StartTag {$self->xml_start_tag(@_);}
751sub EndTag {$self->xml_end_tag(@_);}
752sub Text {$self->xml_text(@_);}
753sub PI {$self->xml_pi(@_);}
754sub EndDocument {$self->xml_end_document(@_);}
755sub Default {$self->xml_default(@_);}
756
757# This Char function overrides the one in XML::Parser::Stream to overcome a
758# problem where $expat->{Text} is treated as the return value, slowing
759# things down significantly in some cases.
760sub Char {
761 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
762 $_[0]->{'Text'} .= $_[1];
763 return undef;
764}
765
766# Called at the beginning of the XML document.
767sub xml_start_document {
768 my $self = shift(@_);
769 my ($expat) = @_;
770
771 $self->{'rdf_desc'} = {};
772}
773
774# Called for XML declarations
775sub xml_xmldecl {
776 my $self = shift(@_);
777 my ($expat, $version, $encoding, $standalone) = @_;
778}
779
780# Called for XML entities
781sub xml_entity {
782 my $self = shift(@_);
783 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
784}
785
786# Called for DOCTYPE declarations - use die to bail out if this doctype
787# is not meant for this plugin
788sub xml_doctype {
789 my $self = shift(@_);
790 my ($expat, $name, $sysid, $pubid, $internal) = @_;
791
792 die "" if ($name !~ /^rdf:RDF$/);
793
794 my $outhandle = $self->{'outhandle'};
795 print $outhandle "CONTENTdmPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
796
797}
798
799# Called for every start tag. The $_ variable will contain a copy of the
800# tag and the %_ variable will contain the element's attributes.
801sub xml_start_tag {
802 my $self = shift(@_);
803 my ($expat, $element) = @_;
804
805 if ($element eq "rdf:Description") {
806
807 my $about_key = $_{'about'};
808
809 my $rdf_desc = $self->{'rdf_desc'};
810 $rdf_desc->{$about_key} = {};
811
812 $self->{'about_key'} = $about_key;
813 $self->{'index_text'} = "";
814 $self->{'pp_text'} = "<table width=\"100%\">\n";
815
816
817 }
818 elsif (defined $self->{'about_key'}) {
819 $self->{'metadata_name'} = $element;
820 $self->{'metadata_value'} = "";
821 }
822
823}
824
825# Called for every end tag. The $_ variable will contain a copy of the tag.
826sub xml_end_tag {
827 my $self = shift(@_);
828 my ($expat, $element) = @_;
829
830 if ($element eq "rdf:Description") {
831 $self->{'pp_text'} .= "</table>\n";
832 ## ghtml::htmlsafe($self->{'pp_text'});
833
834
835 my $about_key = $self->{'about_key'};
836 my $about = $self->{'rdf_desc'}->{$about_key};
837 $about->{'IndexText'} = $self->{'index_text'};
838 $about->{'MetadataTable'} = $self->{'pp_text'};
839
840
841 $self->{'about_key'} = undef;
842 $self->{'index_text'} = undef;
843 $self->{'pp_text'} = undef;
844
845 }
846 elsif (defined $self->{'metadata_name'}) {
847 my $metadata_name = $self->{'metadata_name'};
848 if ($element eq $metadata_name) {
849 my $metadata_value = $self->{'metadata_value'};
850
851 my $about_key = $self->{'about_key'};
852 my $about = $self->{'rdf_desc'}->{$about_key};
853 $about->{$metadata_name} = $metadata_value;
854
855 $self->{'index_text'} .= "$metadata_value\n";
856 $self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
857
858 $self->{'metadata_name'} = undef;
859 $self->{'metadata_value'} = undef;
860 }
861 }
862}
863
864# Called just before start or end tags with accumulated non-markup text in
865# the $_ variable.
866sub xml_text {
867 my $self = shift(@_);
868 my ($expat) = @_;
869
870 if (defined $self->{'metadata_name'}) {
871 $self->{'metadata_value'} .= $_;
872 }
873}
874
875# Called for processing instructions. The $_ variable will contain a copy
876# of the pi.
877sub xml_pi {
878 my $self = shift(@_);
879 my ($expat, $target, $data) = @_;
880}
881
882# Called at the end of the XML document.
883sub xml_end_document {
884 my $self = shift(@_);
885 my ($expat) = @_;
886
887}
888
889# Called for any characters not handled by the above functions.
890sub xml_default {
891 my $self = shift(@_);
892 my ($expat, $text) = @_;
893}
894
895
8961;
Note: See TracBrowser for help on using the repository browser.