source: main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 22.8 KB
Line 
1###########################################################################
2#
3# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package CONTENTdmPlugin;
26
27use ConvertBinaryFile;
28use ReadXMLFile;
29
30use unicode;
31use ghtml;
32
33use strict;
34no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37use XMLParser;
38
39# inherit ReadXMLFile for the apply_xslt method
40sub BEGIN {
41 @CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42}
43
44
45my $convert_to_list =
46 [
47# { 'name' => "auto",
48# 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
49# { 'name' => "html",
50# 'desc' => "{ConvertBinaryFile.convert_to.html}" },
51# { 'name' => "text",
52# 'desc' => "{ConvertBinaryFile.convert_to.text}" },
53 { 'name' => "pagedimg",
54 'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
55 ];
56
57
58
59my $arguments =
60 [
61 { 'name' => "convert_to",
62 'desc' => "{ConvertBinaryFile.convert_to}",
63 'type' => "enum",
64 'reqd' => "yes",
65 'list' => $convert_to_list,
66 'deft' => "html" },
67 { 'name' => "xslt",
68 'desc' => "{ReadXMLFile.xslt}",
69 'type' => "string",
70 'deft' => "",
71 'reqd' => "no" },
72 { 'name' => "process_exp",
73 'desc' => "{BaseImporter.process_exp}",
74 'type' => "regexp",
75 'deft' => &get_default_process_exp(),
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BaseImporter.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp() }
81];
82
83my $options = { 'name' => "CONTENTdmPlugin",
84 'desc' => "{CONTENTdmPlugin.desc}",
85 'abstract' => "no",
86 'inherits' => "yes",
87 # CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
88 'srcreplaceable' => "no",
89 'args' => $arguments };
90
91sub new {
92 my ($class) = shift (@_);
93 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
94 push(@$pluginlist, $class);
95
96 push(@$inputargs,"-title_sub");
97 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
98
99 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
100 push(@{$hashArgOptLists->{"OptList"}},$options);
101
102 my @arg_array = @$inputargs;
103 my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
104
105 if ($self->{'info_only'}) {
106 # don't worry about any options etc
107 return bless $self, $class;
108 }
109
110 my $parser = new XML::Parser('Style' => 'Stream',
111 'Pkg' => 'ReadXMLFile',
112 'PluginObj' => $self,
113 'Handlers' => {'Char' => \&ReadXMLFile::Char,
114 'XMLDecl' => \&ReadXMLFile::XMLDecl,
115 'Entity' => \&ReadXMLFile::Entity,
116 'Doctype' => \&ReadXMLFile::Doctype,
117 'Default' => \&ReadXMLFile::Default
118 });
119 $self->{'parser'} = $parser;
120
121
122 $self->{'rdf_desc'} = undef;
123 $self->{'about_key'} = undef;
124 $self->{'metadata_name'} = undef;
125 $self->{'metadata_value'} = undef;
126
127 # do we only allow one option??
128 $self->{'convert_to'} = "pagedimg";
129 $self->{'convert_to_plugin'} = "PagedImagePlugin";
130 $self->{'convert_to_ext'} = "jpg";
131
132 my $secondary_plugin_name = $self->{'convert_to_plugin'};
133 my $secondary_plugin_options = $self->{'secondary_plugin_options'};
134
135 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
136 $secondary_plugin_options->{$secondary_plugin_name} = [];
137 }
138 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
139
140 push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
141 push(@$specific_options, "-create_thumbnail", "true", "-create_screenview", "true");
142 push(@$specific_options, "-file_rename_method", "none");
143 push(@$specific_options, "-processing_tmp_files");
144
145# my $secondary_plugin_options = $self->{'secondary_plugin_options'};
146
147# if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
148# $secondary_plugin_options->{'PagedImagePlugin'} = [];
149# }
150# my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
151# push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
152# push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
153# push(@$pagedimg_options, "-file_rename_method", "none");
154# push(@$pagedimg_options, "-processing_tmp_files");
155 $self = bless $self, $class;
156 $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
157 return $self;
158}
159
160sub get_default_process_exp {
161 my $self = shift (@_);
162
163 return q^(?i)\.rdf$^;
164}
165
166sub get_default_block_exp {
167 return q^(?i)\.(jpg|jpeg|gif)$^;
168}
169
170
171
172sub rdf_desc_to_id
173{
174 my $self = shift (@_);
175 my ($rdf_desc) = @_;
176
177 my $rdf_id = {};
178
179 # initialise any .cpd (=complex multi page) structures
180
181 foreach my $about_key (keys %{$rdf_desc}) {
182 if ($about_key =~ m/\.cpd$/) {
183 my $about = $rdf_desc->{$about_key};
184 my $id = $about->{'dc:identifier'};
185
186 if ($id =~ m/^\s*$/) {
187 # missing id, make one up based on about attribute
188
189 my ($tailname, $dirname, $suffix)
190 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
191
192 $id = "about:$tailname";
193 }
194
195 $rdf_id->{$id} = $about;
196 $rdf_id->{$id}->{'ex:filename'} = $about_key;
197 $rdf_id->{$id}->{'ex:type'} = "complex";
198 $rdf_id->{$id}->{'pages'} = [];
199 }
200
201 }
202
203 # now add in *non* .cpd items
204
205 foreach my $about_key (keys %{$rdf_desc}) {
206 if ($about_key !~ m/\.cpd$/) {
207 my $about = $rdf_desc->{$about_key};
208 my $id = $about->{'dc:identifier'};
209
210
211 if ($id =~ m/^\s*$/) {
212 # missing id, make one up based on about attribute
213
214 my ($tailname, $dirname, $suffix)
215 = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
216
217 $id = "about:$tailname";
218 }
219
220 if (defined $rdf_id->{$id}) {
221 $about->{'ex:filename'} = $about_key;
222
223 # dealing with complex multi-page situation
224 # Add to existing structure
225
226 my $pages = $rdf_id->{$id}->{'pages'};
227 push(@$pages,$about)
228 }
229 else {
230 # New entry
231
232 $rdf_id->{$id} = $about;
233 $rdf_id->{$id}->{'ex:type'} = "simple";
234 $rdf_id->{$id}->{'ex:filename'} = $about_key;
235 }
236 }
237
238 }
239
240 return $rdf_id;
241}
242
243
244sub metadata_table_txt_file
245{
246 my $self = shift (@_);
247 my ($output_root,$page_num) = @_;
248
249 my $txt_filename = $output_root."_page_$page_num.txt";
250
251 my ($tailname, $dirname, $suffix)
252 = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
253
254 my $txt_file = "$tailname$suffix";
255
256 return $txt_file;
257}
258
259
260sub output_metadata_table
261{
262 my $self = shift (@_);
263 my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
264
265 my $txt_filename = &FileUtils::filenameConcatenate($tmp_dirname,$txt_file);
266
267 open(TOUT,">$txt_filename")
268 || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
269
270 print TOUT $page->{'MetadataTable'};
271 delete $page->{'MetadataTable'};
272
273 close (TOUT);
274}
275
276
277sub rdf_id_to_item_file
278{
279 my $self = shift (@_);
280 my ($rdf_id,$tmp_dirname,$output_root) = @_;
281
282 my $item_file_list = [];
283
284 foreach my $id (keys %{$rdf_id}) {
285
286 my $id_safe = $id;
287 $id_safe =~ s/ /-/g;
288
289 my $output_filename = $output_root."_$id_safe.item";
290 open(FOUT,">$output_filename")
291 || die "Unable to open $output_filename: $!\n";
292
293
294 print FOUT "<PagedDocument>\n";
295
296 my $rdf_doc = $rdf_id->{$id};
297 foreach my $metadata_name (keys %$rdf_doc) {
298
299
300 next if ($metadata_name eq "pages");
301
302 my $metadata_value = $rdf_doc->{$metadata_name};
303
304 # convert ns:name to ns.Name
305 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
306
307 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
308 }
309
310 if ($rdf_doc->{'ex:type'} eq "complex") {
311 my $pages = $rdf_doc->{'pages'};
312 my $page_num = 1;
313
314 foreach my $page (@$pages) {
315
316 my $imgfile = $page->{'ex:filename'};
317 if ($imgfile =~ m/(http|ftp):/) {
318 $imgfile = "empty.jpg";
319 }
320 else {
321 $imgfile = &FileUtils::filenameConcatenate("..","import",$imgfile);
322 }
323
324 my $txt_file
325 = $self->metadata_table_txt_file($output_root,$page_num);
326
327 $self->output_metadata_table($page,$page_num,
328 $tmp_dirname,$txt_file);
329
330
331 print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
332
333 foreach my $metadata_name (keys %$page) {
334
335 my $metadata_value = $rdf_doc->{$metadata_name};
336 # convert ns:name to ns.Name
337 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
338
339 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
340 }
341
342
343 $page_num++;
344
345
346 print FOUT " </Page>\n";
347 }
348 }
349 else {
350 # simple
351 # duplicate top-level metadata for now plus image to bind to
352
353 my $imgfile = $rdf_doc->{'ex:filename'};
354 if ($imgfile =~ m/(http|ftp):/) {
355 $imgfile = "empty.jpg";
356 }
357 else {
358 $imgfile = &FileUtils::filenameConcatenate("..","import",$imgfile);
359 }
360
361
362 my $txt_file = $self->metadata_table_txt_file($output_root,1);
363 $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
364
365 print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
366 foreach my $metadata_name (keys %$rdf_doc) {
367
368 my $metadata_value = $rdf_doc->{$metadata_name};
369
370 # convert ns:name to ns.Name
371 $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/;
372
373 print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
374 }
375 print FOUT " </Page>\n";
376
377 }
378
379 print FOUT "</PagedDocument>\n";
380 close(FOUT);
381
382 push(@$item_file_list,$output_filename);
383
384 }
385
386
387 return $item_file_list;
388}
389
390
391
392sub xml_area_convert_file
393{
394 my $self = shift (@_);
395 my ($input_filename, $tmp_dirname, $output_root) = @_;
396
397 eval {
398 # Build up hash table/tree of all records
399
400 my $xslt = $self->{'xslt'};
401 if (defined $xslt && ($xslt ne "")) {
402 # perform xslt
403 my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
404
405 open(TOUT,">/tmp/tout.xml")
406 || die "Unable to open /tmp/tout.xml: $!\n";
407 print TOUT $transformed_xml;
408 close(TOUT);
409
410
411 # feed transformed file (now in memory as string) into XML parser
412 $self->{'parser'}->parse($transformed_xml);
413 }
414 else {
415 $self->{'parser'}->parsefile($input_filename);
416 }
417 };
418
419 if ($@) {
420
421 # parsefile may either croak somewhere in XML::Parser (e.g. because
422 # the document is not well formed) or die somewhere in XMLPlug or a
423 # derived plugin (e.g. because we're attempting to process a
424 # document whose DOCTYPE is not meant for this plugin). For the
425 # first case we'll print a warning and continue, for the second
426 # we'll just continue quietly
427
428 print STDERR "**** Error is: $@\n";
429
430 my $file = $self->{'file'};
431
432 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
433 if (defined $msg) {
434 my $outhandle = $self->{'outhandle'};
435 my $plugin_name = ref ($self);
436 print $outhandle "$plugin_name failed to process $file ($msg)\n";
437 }
438
439 my $gli = $self->{'gli'};
440
441 # reset ourself for the next document
442 $self->{'section_level'}=0;
443 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
444 return ("fail",undef); # error during processing
445 }
446
447 my $rdf_desc = $self->{'rdf_desc'};
448
449# foreach my $about_key (keys %{$rdf_desc}) {
450# my $about = $rdf_desc->{$about_key};
451# foreach my $metadata_name (keys %{$about}) {
452#
453# my $metadata_value = $about->{$metadata_name};
454## print STDERR " $metadata_name: $metadata_value\n";
455# }
456# }
457
458
459 # Merge entries with same name
460
461
462 my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
463
464# foreach my $about_key (keys %{$merged_rdf_id}) {
465# my $about = $merged_rdf_id->{$about_key};
466# foreach my $metadata_name (keys %{$about}) {
467#
468# my $metadata_value = $about->{$metadata_name};
469## print STDERR " $metadata_name: $metadata_value\n";
470# }
471# }
472
473
474
475 my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
476 $output_root);
477
478 return ("item",$item_files);
479}
480
481
482# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
483# to CONTENTdm
484#
485# A better (i.e. in the future) solution would be to see if this can be
486# shifted into gsConvert.pl so there is no need to override the
487# default tmp_area_convert_file()
488
489
490sub tmp_area_convert_file {
491 my $self = shift (@_);
492 my ($output_ext, $input_filename, $textref) = @_;
493
494 # is textref ever used?!?
495
496 my $outhandle = $self->{'outhandle'};
497 my $convert_to = $self->{'convert_to'};
498 my $failhandle = $self->{'failhandle'};
499 my $convert_to_ext = $self->{'convert_to_ext'};
500
501 # softlink to collection tmp dir
502 my $tmp_dirname
503 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
504 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
505
506 # derive tmp filename from input filename
507 my ($tailname, $dirname, $suffix)
508 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
509
510 # Remove any white space from filename -- no risk of name collision, and
511 # makes later conversion by utils simpler. Leave spaces in path...
512 # tidy up the filename with space, dot, hyphen between
513 $tailname =~ s/\s+//g;
514 $tailname =~ s/\.+//g;
515 $tailname =~ s/\-+//g;
516
517 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
518 $suffix = lc($suffix);
519 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
520
521 &FileUtils::softLink($input_filename, $tmp_filename);
522 my $verbosity = $self->{'verbosity'};
523 if ($verbosity > 0) {
524 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
525 }
526
527 my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log");
528
529 # call xml_area_convert_file rather than gsConvert.pl
530
531 my $output_root = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname");
532
533 my ($output_type,$item_files)
534 = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
535
536
537 my $fakeimg_filename = &FileUtils::filenameConcatenate($dirname, "empty.jpg");
538 my $fakeimg_tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "empty.jpg");
539
540 print STDERR "***** No source image identified with item\n";
541
542 print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
543
544 &FileUtils::softLink($fakeimg_filename, $fakeimg_tmp_filename);
545
546 # continue as before ...
547
548 # remove symbolic link to original file
549 &FileUtils::removeFiles($tmp_filename);
550
551 # Check STDERR here
552 chomp $output_type;
553 if ($output_type eq "fail") {
554 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
555 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
556 $self->{'num_not_processed'} ++;
557 if (-s "$errlog") {
558 open(ERRLOG, "$errlog");
559 while (<ERRLOG>) {
560 print $outhandle "$_";
561 }
562 print $outhandle "\n";
563 close ERRLOG;
564 }
565 &FileUtils::removeFiles("$errlog") if (-e "$errlog");
566 return [];
567 }
568
569 # store the *actual* output type and return the output filename
570 # it's possible we requested conversion to html, but only to text succeeded
571 #$self->{'convert_to_ext'} = $output_type;
572 if ($output_type =~ /html/i) {
573 $self->{'converted_to'} = "HTML";
574 } elsif ($output_type =~ /te?xt/i) {
575 $self->{'converted_to'} = "Text";
576 } elsif ($output_type =~ /item/i){
577 $self->{'converted_to'} = "PagedImage";
578 }
579
580
581 return $item_files;
582}
583
584
585
586
587# Override ConvertBinaryFile (ie BaseImporter) read
588# Needed so multiple .item files generated are sent down secondary plugin
589# and the resulting doc_objs all processed.
590
591sub read {
592 my $self = shift (@_);
593 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
594
595
596 $self->{'gli'} = $gli;
597 $self->{'file'} = $file;
598
599 my $successful_rv = -1;
600
601 my $outhandle = $self->{'outhandle'};
602
603 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
604 return undef unless $self->can_process_this_file($filename_full_path);
605
606 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
607
608 # read() deviates at this point from ConvertBinaryFile
609 # Need to work with list of filename returned
610
611 my $output_ext = $self->{'convert_to_ext'};
612 my $conv_filename_list = [];
613
614 $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
615
616 if (scalar(@$conv_filename_list)==0) {
617 return -1;
618 } # had an error, will be passed down pipeline
619
620 foreach my $conv_filename ( @$conv_filename_list ) {
621 if (! -e "$conv_filename") {return -1;}
622 $self->{'conv_filename'} = $conv_filename; # is this used anywhere?
623 $self->convert_post_process($conv_filename);
624
625 my $secondary_plugins = $self->{'secondary_plugins'};
626 my $num_secondary_plugins = scalar(keys %$secondary_plugins);
627
628 if ($num_secondary_plugins == 0) {
629 print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
630 return 0; # effectively block it
631 }
632
633 my @plugin_names = keys %$secondary_plugins;
634 my $plugin_name = shift @plugin_names;
635
636 if ($num_secondary_plugins > 1) {
637 print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
638 }
639
640 my $secondary_plugin = $secondary_plugins->{$plugin_name};
641
642 # note: metadata is not carried on to the next level
643 my ($rv,$doc_obj)
644 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
645 $block_hash, $metadata, $processor, $maxdocs, $total_count,
646 $gli);
647
648 print STDERR "**** $conv_filename => returned rv = $rv\n";
649
650 if ((defined $rv) && ($rv>=0)) {
651 $successful_rv = 1;
652 }
653
654 # Override previous gsdlsourcefilename set by secondary plugin
655 my $collect_file = &util::filename_within_collection($filename_full_path);
656 my $collect_conv_file = &util::filename_within_collection($conv_filename);
657 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
658 $doc_obj->set_converted_filename($collect_conv_file);
659
660 my ($filemeta) = $file =~ /([^\\\/]+)$/;
661 my $plugin_filename_encoding = $self->{'filename_encoding'};
662 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
663 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
664 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
665 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
666
667 if ($self->{'cover_image'}) {
668 $self->associate_cover_image($doc_obj, $filename_full_path);
669 }
670
671 # do plugin specific processing of doc_obj
672 unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
673 print STDERR "***** process returned undef: $base_dir $file\n";
674 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
675 return -1;
676 }
677 # do any automatic metadata extraction
678 $self->auto_extract_metadata ($doc_obj);
679
680 # have we found a Title??
681 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
682
683 # add an OID
684 $self->add_OID($doc_obj);
685 # process the document
686 $processor->process($doc_obj);
687
688 $self->{'num_processed'} ++;
689 }
690
691 return $successful_rv;
692}
693
694sub process {
695
696 return 1;
697}
698
699# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
700sub process_old {
701 my $self = shift (@_);
702 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
703
704
705 my $secondary_plugins = $self->{'secondary_plugins'};
706 my @plugin_names = keys %$secondary_plugins;
707 my $plugin_name = shift @plugin_names; # already checked there is only one
708
709 my $secondary_plugin = $secondary_plugins->{$plugin_name};
710
711 my $result = $secondary_plugin->process(@_);
712
713 return $result;
714}
715
716
717# Called at the beginning of the XML document.
718sub xml_start_document {
719 my $self = shift(@_);
720 my ($expat) = @_;
721
722 $self->{'rdf_desc'} = {};
723}
724
725
726# Called for DOCTYPE declarations - use die to bail out if this doctype
727# is not meant for this plugin
728sub xml_doctype {
729 my $self = shift(@_);
730 my ($expat, $name, $sysid, $pubid, $internal) = @_;
731
732 die "" if ($name !~ /^rdf:RDF$/);
733
734 my $outhandle = $self->{'outhandle'};
735 print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
736
737}
738
739# Called for every start tag. The $_ variable will contain a copy of the
740# tag and the %_ variable will contain the element's attributes.
741sub xml_start_tag {
742 my $self = shift(@_);
743 my ($expat, $element) = @_;
744
745 if ($element eq "rdf:Description") {
746
747 my $about_key = $_{'about'};
748
749 my $rdf_desc = $self->{'rdf_desc'};
750 $rdf_desc->{$about_key} = {};
751
752 $self->{'about_key'} = $about_key;
753 $self->{'index_text'} = "";
754 $self->{'pp_text'} = "<table width=\"100%\">\n";
755
756
757 }
758 elsif (defined $self->{'about_key'}) {
759 $self->{'metadata_name'} = $element;
760 $self->{'metadata_value'} = "";
761 }
762
763}
764
765# Called for every end tag. The $_ variable will contain a copy of the tag.
766sub xml_end_tag {
767 my $self = shift(@_);
768 my ($expat, $element) = @_;
769
770 if ($element eq "rdf:Description") {
771 $self->{'pp_text'} .= "</table>\n";
772 ## ghtml::htmlsafe($self->{'pp_text'});
773
774
775 my $about_key = $self->{'about_key'};
776 my $about = $self->{'rdf_desc'}->{$about_key};
777 $about->{'IndexText'} = $self->{'index_text'};
778 $about->{'MetadataTable'} = $self->{'pp_text'};
779
780
781 $self->{'about_key'} = undef;
782 $self->{'index_text'} = undef;
783 $self->{'pp_text'} = undef;
784
785 }
786 elsif (defined $self->{'metadata_name'}) {
787 my $metadata_name = $self->{'metadata_name'};
788 if ($element eq $metadata_name) {
789 my $metadata_value = $self->{'metadata_value'};
790
791 my $about_key = $self->{'about_key'};
792 my $about = $self->{'rdf_desc'}->{$about_key};
793 $about->{$metadata_name} = $metadata_value;
794
795 $self->{'index_text'} .= "$metadata_value\n";
796 $self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
797
798 $self->{'metadata_name'} = undef;
799 $self->{'metadata_value'} = undef;
800 }
801 }
802}
803
804# Called just before start or end tags with accumulated non-markup text in
805# the $_ variable.
806sub xml_text {
807 my $self = shift(@_);
808 my ($expat) = @_;
809
810 if (defined $self->{'metadata_name'}) {
811 $self->{'metadata_value'} .= $_;
812 }
813}
814
815# Called at the end of the XML document.
816sub xml_end_document {
817 my $self = shift(@_);
818 my ($expat) = @_;
819}
820
821
8221;
Note: See TracBrowser for help on using the repository browser.