root/gsdl/trunk/perllib/plugins/CONTENTdmPlugin.pm @ 20790

Revision 20790, 21.8 KB (checked in by kjdon, 10 years ago)

set -processing_tmp_files option to secondary HTML and PagedImage? plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package CONTENTdmPlugin;
26
27use ConvertBinaryFile;
28use ReadXMLFile;
29
30use unicode;
31use ghtml;
32
33use strict;
34no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37use XMLParser;
38
39# inherit ReadXMLFile for the apply_xslt method
40sub BEGIN {
41    @CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42}
43
44
45my $convert_to_list =
46    [ { 'name' => "auto",
47    'desc' => "{ConvertBinaryFile.convert_to.auto}" },
48      { 'name' => "html",
49    'desc' => "{ConvertBinaryFile.convert_to.html}" },
50      { 'name' => "text",
51    'desc' => "{ConvertBinaryFile.convert_to.text}" },
52      { 'name' => "pagedimg",
53    'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
54      ];
55
56
57
58my $arguments =
59      [
60       { 'name' => "convert_to",
61    'desc' => "{ConvertBinaryFile.convert_to}",
62    'type' => "enum",
63    'reqd' => "yes",
64    'list' => $convert_to_list,
65    'deft' => "html" }, 
66      { 'name' => "xslt",
67    'desc' => "{ReadXMLFile.xslt}",
68    'type' => "string",
69    'deft' => "",
70    'reqd' => "no" },
71       { 'name' => "process_exp",
72    'desc' => "{BasePlugin.process_exp}",
73    'type' => "regexp",
74    'deft' => &get_default_process_exp(),
75    'reqd' => "no" },
76      { 'name' => "block_exp",
77    'desc' => "{BasePlugin.block_exp}",
78    'type' => "regexp",
79    'deft' => &get_default_block_exp() }
80];
81
82my $options = { 'name'     => "CONTENTdmPlugin",
83        'desc'     => "{CONTENTdmPlugin.desc}",
84        'abstract' => "no",
85        'inherits' => "yes",
86        # CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
87        'srcreplaceable' => "no",
88        'args'     => $arguments };
89
90sub new {
91    my ($class) = shift (@_);
92    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
93    push(@$pluginlist, $class);
94
95    push(@$inputargs,"-title_sub");
96    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
97
98    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99    push(@{$hashArgOptLists->{"OptList"}},$options);
100
101    my @arg_array = @$inputargs;
102    my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
103   
104    if ($self->{'info_only'}) {
105    # don't worry about any options etc
106    return bless $self, $class;
107    }
108
109    my $parser = new XML::Parser('Style' => 'Stream',
110                 'Pkg' => 'ReadXMLFile',
111                 'PluginObj' => $self,
112                 'Handlers' => {'Char' => \&ReadXMLFile::Char,
113                        'XMLDecl' => \&ReadXMLFile::XMLDecl,
114                        'Entity'  => \&ReadXMLFile::Entity,
115                        'Doctype' => \&ReadXMLFile::Doctype,
116                        'Default' => \&ReadXMLFile::Default
117                        }); 
118    $self->{'parser'} = $parser;
119
120
121    $self->{'rdf_desc'} = undef;
122    $self->{'about_key'} = undef;
123    $self->{'metadata_name'} = undef;
124    $self->{'metadata_value'} = undef;
125
126    $self->{'convert_to'} = "PagedImage";
127    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
128
129    if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
130    $secondary_plugin_options->{'PagedImagePlugin'} = [];
131    }
132    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
133    push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
134    push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
135    push(@$pagedimg_options, "-file_rename_method", "none");
136    push(@$pagedimg_options, "-processing_tmp_files");
137    $self = bless $self, $class;
138
139# ***** no longer needed!
140#    # This needs to be done after blss, to $self passed to XML::Parser
141#    # can correctly resolve the right call-back methods during XML parsing
142
143
144    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
145    return $self;
146}
147
148sub get_default_process_exp {
149    my $self = shift (@_);
150
151    return q^(?i)\.rdf$^;
152}
153
154sub get_default_block_exp {
155    return q^(?i)\.(jpg|jpeg|gif)$^;
156}
157
158
159
160sub rdf_desc_to_id
161{
162    my $self = shift (@_);
163    my ($rdf_desc) = @_;
164
165    my $rdf_id = {};
166
167    # initialise any .cpd (=complex multi page) structures
168
169    foreach my $about_key (keys %{$rdf_desc}) {
170    if ($about_key =~ m/\.cpd$/) {
171        my $about = $rdf_desc->{$about_key};
172        my $id    = $about->{'dc:identifier'};
173
174        if ($id =~ m/^\s*$/) {
175        # missing id, make one up based on about attribute
176
177        my ($tailname, $dirname, $suffix)
178            = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
179
180        $id = "about:$tailname";
181        }
182
183        $rdf_id->{$id} = $about;
184        $rdf_id->{$id}->{'ex:filename'} = $about_key;
185        $rdf_id->{$id}->{'ex:type'} = "complex";
186        $rdf_id->{$id}->{'pages'} = [];
187    }
188
189    }
190
191    # now add in *non* .cpd items
192
193    foreach my $about_key (keys %{$rdf_desc}) {
194    if ($about_key !~ m/\.cpd$/) {
195        my $about = $rdf_desc->{$about_key};       
196        my $id    = $about->{'dc:identifier'};
197
198
199        if ($id =~ m/^\s*$/) {
200        # missing id, make one up based on about attribute
201
202        my ($tailname, $dirname, $suffix)
203            = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
204
205        $id = "about:$tailname";
206        }
207
208        if (defined $rdf_id->{$id}) {
209        $about->{'ex:filename'} = $about_key;
210
211        # dealing with complex multi-page situation
212        # Add to existing structure
213
214        my $pages = $rdf_id->{$id}->{'pages'};
215        push(@$pages,$about)
216        }
217        else {
218        # New entry
219
220        $rdf_id->{$id} = $about;
221        $rdf_id->{$id}->{'ex:type'} = "simple";     
222        $rdf_id->{$id}->{'ex:filename'} = $about_key;
223        }
224    }
225   
226    }
227
228    return $rdf_id;
229}
230
231
232sub metadata_table_txt_file
233{
234    my $self = shift (@_);
235    my ($output_root,$page_num) = @_;
236
237    my $txt_filename = $output_root."_page_$page_num.txt";
238
239    my ($tailname, $dirname, $suffix)
240    = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
241
242    my $txt_file = "$tailname$suffix";
243
244    return $txt_file;
245}
246
247
248sub output_metadata_table
249{
250    my $self = shift (@_);
251    my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
252   
253    my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
254
255    open(TOUT,">$txt_filename")
256    || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
257   
258    print TOUT $page->{'MetadataTable'};
259    delete $page->{'MetadataTable'};
260
261    close (TOUT);
262}
263
264
265sub rdf_id_to_item_file
266{
267    my $self = shift (@_);
268    my ($rdf_id,$tmp_dirname,$output_root) = @_;
269
270    my $item_file_list = [];
271   
272    foreach my $id (keys %{$rdf_id}) {
273
274    my $id_safe = $id;
275    $id_safe =~ s/ /-/g;
276
277    my $output_filename = $output_root."_$id_safe.item";
278    open(FOUT,">$output_filename")
279        || die "Unable to open $output_filename: $!\n";
280   
281
282    print FOUT "<PagedDocument>\n";
283
284    my $rdf_doc = $rdf_id->{$id};
285    foreach my $metadata_name (keys %$rdf_doc) {
286
287
288        next if ($metadata_name eq "pages");
289
290        my $metadata_value = $rdf_doc->{$metadata_name};
291
292        # convert ns:name to ns.Name
293        $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
294
295        print FOUT "  <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
296    }
297
298    if ($rdf_doc->{'ex:type'} eq "complex") {
299        my $pages = $rdf_doc->{'pages'};
300        my $page_num = 1;
301
302        foreach my $page (@$pages) {
303
304        my $imgfile = $page->{'ex:filename'};
305        if ($imgfile =~ m/(http|ftp):/) {
306            $imgfile = "empty.jpg";
307        }
308        else {
309            $imgfile = &util::filename_cat("..","import",$imgfile);
310        }
311       
312        my $txt_file
313            = $self->metadata_table_txt_file($output_root,$page_num);
314
315        $self->output_metadata_table($page,$page_num,
316                         $tmp_dirname,$txt_file);
317
318
319        print FOUT "  <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
320
321        foreach my $metadata_name (keys %$page) {
322       
323            my $metadata_value = $rdf_doc->{$metadata_name};
324            # convert ns:name to ns.Name
325            $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
326
327            print FOUT "  <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
328        }
329       
330
331        $page_num++;
332
333
334        print FOUT "  </Page>\n";
335        }
336    }
337    else {
338        # simple
339        # duplicate top-level metadata for now plus image to bind to
340
341        my $imgfile = $rdf_doc->{'ex:filename'};
342        if ($imgfile =~ m/(http|ftp):/) {
343        $imgfile = "empty.jpg";
344        }
345        else {
346        $imgfile = &util::filename_cat("..","import",$imgfile);
347        }
348
349
350        my $txt_file = $self->metadata_table_txt_file($output_root,1);
351        $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
352
353        print FOUT "  <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";       
354        foreach my $metadata_name (keys %$rdf_doc) {
355
356        my $metadata_value = $rdf_doc->{$metadata_name};
357       
358        # convert ns:name to ns.Name
359        $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
360
361        print FOUT "  <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
362        }
363        print FOUT "  </Page>\n";
364
365    }
366
367    print FOUT "</PagedDocument>\n";
368    close(FOUT);
369
370    push(@$item_file_list,$output_filename);
371
372    }
373   
374
375    return $item_file_list;
376}
377
378
379
380sub xml_area_convert_file
381{
382    my $self = shift (@_);
383    my ($input_filename, $tmp_dirname, $output_root) = @_;
384
385    eval {
386    # Build up hash table/tree of all records
387
388    my $xslt = $self->{'xslt'};
389    if (defined $xslt && ($xslt ne "")) {
390        # perform xslt
391        my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
392
393        open(TOUT,">/tmp/tout.xml")
394        || die "Unable to open /tmp/tout.xml: $!\n";
395        print TOUT $transformed_xml;
396        close(TOUT);
397
398
399        # feed transformed file (now in memory as string) into XML parser
400        $self->{'parser'}->parse($transformed_xml);
401    }
402    else {
403        $self->{'parser'}->parsefile($input_filename);
404    }
405    };
406 
407    if ($@) {
408
409    # parsefile may either croak somewhere in XML::Parser (e.g. because
410    # the document is not well formed) or die somewhere in XMLPlug or a
411    # derived plugin (e.g. because we're attempting to process a
412    # document whose DOCTYPE is not meant for this plugin). For the
413    # first case we'll print a warning and continue, for the second
414    # we'll just continue quietly
415
416    print STDERR "**** Error is: $@\n";
417
418    my $file = $self->{'file'};
419
420    my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
421    if (defined $msg) {
422        my $outhandle = $self->{'outhandle'};
423        my $plugin_name = ref ($self);
424        print $outhandle "$plugin_name failed to process $file ($msg)\n";
425    }
426
427    my $gli = $self->{'gli'};
428
429    # reset ourself for the next document
430    $self->{'section_level'}=0;
431    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
432    return ("fail",undef); # error during processing
433    }
434
435    my $rdf_desc = $self->{'rdf_desc'};   
436
437#    foreach my $about_key (keys %{$rdf_desc}) {
438#   my $about = $rdf_desc->{$about_key};
439#   foreach my $metadata_name (keys %{$about}) {
440#
441#       my $metadata_value = $about->{$metadata_name};
442##      print STDERR " $metadata_name: $metadata_value\n";
443#   }
444#    }
445
446
447    # Merge entries with same name
448
449
450    my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
451
452#    foreach my $about_key (keys %{$merged_rdf_id}) {
453#   my $about = $merged_rdf_id->{$about_key};
454#   foreach my $metadata_name (keys %{$about}) {
455#
456#       my $metadata_value = $about->{$metadata_name};
457##      print STDERR " $metadata_name: $metadata_value\n";
458#   }
459#    }
460
461
462
463    my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
464                        $output_root);
465
466    return ("item",$item_files);
467}
468
469
470# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
471# to CONTENTdm
472#
473# A better (i.e. in the future) solution would be to see if this can be
474# shifted into gsConvert.pl so there is no need to override the
475# default tmp_area_convert_file()
476
477
478sub tmp_area_convert_file {
479    my $self = shift (@_);
480    my ($output_ext, $input_filename, $textref) = @_;
481
482    # is textref ever used?!?
483
484    my $outhandle = $self->{'outhandle'};
485    my $convert_to = $self->{'convert_to'};
486    my $failhandle = $self->{'failhandle'};
487    my $convert_to_ext = $self->{'convert_to_ext'};
488   
489    # softlink to collection tmp dir
490    my $tmp_dirname
491    = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
492    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
493
494    # derive tmp filename from input filename
495    my ($tailname, $dirname, $suffix)
496    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
497
498    # Remove any white space from filename -- no risk of name collision, and
499    # makes later conversion by utils simpler. Leave spaces in path...
500    # tidy up the filename with space, dot, hyphen between
501    $tailname =~ s/\s+//g;
502    $tailname =~ s/\.+//g;
503    $tailname =~ s/\-+//g;
504
505    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
506    $suffix = lc($suffix);
507    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
508
509    &util::soft_link($input_filename, $tmp_filename);
510    my $verbosity = $self->{'verbosity'};
511    if ($verbosity > 0) {
512    print $outhandle "Converting $tailname$suffix to $convert_to format\n";
513    }
514
515    my $errlog = &util::filename_cat($tmp_dirname, "err.log");
516   
517    # call xml_area_convert_file rather than gsConvert.pl
518
519    my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
520
521    my ($output_type,$item_files)
522    = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
523
524
525    my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
526    my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
527
528    print STDERR "***** No source image identified with item\n";
529
530    print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
531
532    &util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
533   
534    # continue as before ...
535
536    # remove symbolic link to original file
537    &util::rm($tmp_filename);
538
539    # Check STDERR here
540    chomp $output_type;
541    if ($output_type eq "fail") {
542    print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
543    print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
544    $self->{'num_not_processed'} ++;
545    if (-s "$errlog") {
546        open(ERRLOG, "$errlog");
547        while (<ERRLOG>) {
548        print $outhandle "$_";
549        }
550        print $outhandle "\n";
551        close ERRLOG;
552    }
553    &util::rm("$errlog") if (-e "$errlog");
554    return [];
555    }
556
557    # store the *actual* output type and return the output filename
558    # it's possible we requested conversion to html, but only to text succeeded
559    #$self->{'convert_to_ext'} = $output_type;
560    if ($output_type =~ /html/i) {
561    $self->{'converted_to'} = "HTML";
562    } elsif ($output_type =~ /te?xt/i) {
563    $self->{'converted_to'} = "Text";
564    } elsif ($output_type =~ /item/i){
565    $self->{'converted_to'} = "PagedImage";
566    }
567
568   
569    return $item_files;
570}
571
572
573
574
575# Override ConvertBinaryFile read
576# Needed so multiple .item files generated are sent down secondary plugin
577
578sub read {
579    my $self = shift (@_);
580    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
581
582
583    $self->{'gli'} = $gli;
584    $self->{'file'} = $file;
585
586    my $successful_rv = -1;
587
588    my $outhandle = $self->{'outhandle'};
589   
590    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
591    return undef unless $self->can_process_this_file($filename_full_path);
592
593    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
594
595    # read() deviates at this point from ConvertBinaryFile
596    # Need to work with list of filename returned
597
598    my $output_ext = $self->{'convert_to_ext'};
599    my $conv_filename_list = [];
600
601    $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
602
603    if (scalar(@$conv_filename_list)==0) {
604    return -1;
605    } # had an error, will be passed down pipeline
606
607    foreach my $conv_filename ( @$conv_filename_list ) {
608    if (! -e "$conv_filename") {return -1;}
609    $self->{'conv_filename'} = $conv_filename; # is this used anywhere?
610    $self->convert_post_process($conv_filename);
611   
612    my $secondary_plugins =  $self->{'secondary_plugins'};
613    my $num_secondary_plugins = scalar(keys %$secondary_plugins);
614
615    if ($num_secondary_plugins == 0) {
616        print $outhandle "Warning: No secondary plugin to use in conversion.  Skipping $file\n";
617        return 0; # effectively block it
618    }
619
620    my @plugin_names = keys %$secondary_plugins;
621    my $plugin_name = shift @plugin_names;
622   
623    if ($num_secondary_plugins > 1) {
624        print $outhandle "Warning: Multiple secondary plugins not supported yet!  Choosing $plugin_name\n.";
625    }
626   
627    my $secondary_plugin = $secondary_plugins->{$plugin_name};
628
629    # note: metadata is not carried on to the next level
630    my ($rv,$doc_obj)
631        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
632                            $block_hash, $metadata, $processor, $maxdocs, $total_count,
633                            $gli);
634
635    print STDERR "**** $conv_filename => returned rv = $rv\n";
636
637    if ((defined $rv) && ($rv>=0)) {
638        $successful_rv = 1;
639    }
640
641    # Override previous gsdlsourcefilename set by secondary plugin
642    my $collect_file = &util::filename_within_collection($filename_full_path);
643    my $collect_conv_file = &util::filename_within_collection($conv_filename);
644    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
645    $doc_obj->set_converted_filename($collect_conv_file);
646   
647    my ($filemeta) = $file =~ /([^\\\/]+)$/;
648    $self->set_Source_metadata($doc_obj, $filemeta);
649    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
650    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
651   
652    if ($self->{'cover_image'}) {
653        $self->associate_cover_image($doc_obj, $filename_full_path);
654    }
655   
656    # do plugin specific processing of doc_obj
657    unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
658        print STDERR "***** process returned undef: $base_dir $file\n";
659        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
660        return -1;
661    }
662    # do any automatic metadata extraction
663    $self->auto_extract_metadata ($doc_obj);
664
665    # have we found a Title??
666    $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
667
668    # add an OID
669    $self->add_OID($doc_obj);
670    # process the document
671    $processor->process($doc_obj);
672
673    $self->{'num_processed'} ++;
674    }
675
676    return $successful_rv;
677}
678
679sub process {
680
681    return 1;
682}
683
684# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
685sub process_old {
686    my $self = shift (@_);
687    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
688
689   
690    my $secondary_plugins =  $self->{'secondary_plugins'};
691    my @plugin_names = keys %$secondary_plugins;
692    my $plugin_name = shift @plugin_names; # already checked there is only one
693   
694    my $secondary_plugin = $secondary_plugins->{$plugin_name};
695   
696    my $result = $secondary_plugin->process(@_);
697
698    return $result;
699}
700
701
702# Called at the beginning of the XML document.
703sub xml_start_document {
704    my $self = shift(@_);
705    my ($expat) = @_;
706
707    $self->{'rdf_desc'} = {};
708}
709
710
711# Called for DOCTYPE declarations - use die to bail out if this doctype
712# is not meant for this plugin
713sub xml_doctype {
714    my $self = shift(@_);
715    my ($expat, $name, $sysid, $pubid, $internal) = @_;
716
717    die "" if ($name !~ /^rdf:RDF$/);
718
719    my $outhandle = $self->{'outhandle'};
720    print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
721
722}
723
724# Called for every start tag. The $_ variable will contain a copy of the
725# tag and the %_ variable will contain the element's attributes.
726sub xml_start_tag {
727    my $self = shift(@_);
728    my ($expat, $element) = @_;
729
730    if ($element eq "rdf:Description") {
731
732    my $about_key = $_{'about'};
733
734    my $rdf_desc = $self->{'rdf_desc'};
735    $rdf_desc->{$about_key} = {};
736
737    $self->{'about_key'}  = $about_key;
738    $self->{'index_text'} = "";
739    $self->{'pp_text'}    = "<table width=\"100%\">\n";
740
741
742    }
743    elsif (defined $self->{'about_key'}) { 
744    $self->{'metadata_name'} = $element;
745    $self->{'metadata_value'} = "";
746    }
747
748}
749
750# Called for every end tag. The $_ variable will contain a copy of the tag.
751sub xml_end_tag {
752    my $self = shift(@_);
753    my ($expat, $element) = @_;
754
755    if ($element eq "rdf:Description") {
756    $self->{'pp_text'} .= "</table>\n";
757    ## ghtml::htmlsafe($self->{'pp_text'});
758
759
760    my $about_key = $self->{'about_key'};
761    my $about = $self->{'rdf_desc'}->{$about_key};
762    $about->{'IndexText'}     = $self->{'index_text'};
763    $about->{'MetadataTable'} = $self->{'pp_text'};
764
765
766    $self->{'about_key'}   = undef;
767    $self->{'index_text'}  = undef;
768    $self->{'pp_text'}     = undef;
769
770    }
771    elsif (defined $self->{'metadata_name'}) {
772    my $metadata_name = $self->{'metadata_name'};
773    if ($element eq $metadata_name) {
774        my $metadata_value = $self->{'metadata_value'};
775
776        my $about_key = $self->{'about_key'};
777        my $about = $self->{'rdf_desc'}->{$about_key};
778        $about->{$metadata_name} = $metadata_value;
779
780        $self->{'index_text'} .= "$metadata_value\n";
781        $self->{'pp_text'} .= "  <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
782
783        $self->{'metadata_name'}  = undef;
784        $self->{'metadata_value'} = undef;
785    }
786    }
787}
788
789# Called just before start or end tags with accumulated non-markup text in
790# the $_ variable.
791sub xml_text {
792    my $self = shift(@_);
793    my ($expat) = @_;
794
795    if (defined $self->{'metadata_name'}) {
796    $self->{'metadata_value'} .= $_;
797    }
798}
799
800# Called at the end of the XML document.
801sub xml_end_document {
802    my $self = shift(@_);
803    my ($expat) = @_;
804}
805
806
8071;
Note: See TracBrowser for help on using the browser.