Context Navigation

source: gsdl/trunk/perllib/plugins/CONTENTdmPlugin.pm@ 19811

Last change on this file since 19811 was 19052, checked in by davidb, 15 years ago
Modifications to CONENTdmPlugin so its works with Kathy's new plugin restructuring. Still considered experimental.
Property svn:keywords set to `Author Date Id Revision`
File size: 21.8 KB

Line
1	###########################################################################
2	#
3	# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package CONTENTdmPlugin;
26
27	use ConvertBinaryFile;
28	use ReadXMLFile;
29
30	use unicode;
31	use ghtml;
32
33	use strict;
34	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37	use XMLParser;
38
39	# inherit ReadXMLFile for the apply_xslt method
40	sub BEGIN {
41	@CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42	}
43
44
45	my $convert_to_list =
46	[ { 'name' => "auto",
47	'desc' => "{ConvertBinaryFile.convert_to.auto}" },
48	{ 'name' => "html",
49	'desc' => "{ConvertBinaryFile.convert_to.html}" },
50	{ 'name' => "text",
51	'desc' => "{ConvertBinaryFile.convert_to.text}" },
52	{ 'name' => "pagedimg",
53	'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
54	];
55
56
57
58	my $arguments =
59	[
60	{ 'name' => "convert_to",
61	'desc' => "{ConvertBinaryFile.convert_to}",
62	'type' => "enum",
63	'reqd' => "yes",
64	'list' => $convert_to_list,
65	'deft' => "html" },
66	{ 'name' => "xslt",
67	'desc' => "{ReadXMLFile.xslt}",
68	'type' => "string",
69	'deft' => "",
70	'reqd' => "no" },
71	{ 'name' => "process_exp",
72	'desc' => "{BasePlugin.process_exp}",
73	'type' => "regexp",
74	'deft' => &get_default_process_exp(),
75	'reqd' => "no" },
76	{ 'name' => "block_exp",
77	'desc' => "{BasePlugin.block_exp}",
78	'type' => "regexp",
79	'deft' => &get_default_block_exp() }
80	];
81
82	my $options = { 'name' => "CONTENTdmPlugin",
83	'desc' => "{CONTENTdmPlugin.desc}",
84	'abstract' => "no",
85	'inherits' => "yes",
86	# CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
87	'srcreplaceable' => "no",
88	'args' => $arguments };
89
90	sub new {
91	my ($class) = shift (@_);
92	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
93	push(@$pluginlist, $class);
94
95	push(@$inputargs,"-title_sub");
96	push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
97
98	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99	push(@{$hashArgOptLists->{"OptList"}},$options);
100
101	my @arg_array = @$inputargs;
102	my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
103
104	if ($self->{'info_only'}) {
105	# don't worry about any options etc
106	return bless $self, $class;
107	}
108
109	my $parser = new XML::Parser('Style' => 'Stream',
110	'Pkg' => 'ReadXMLFile',
111	'PluginObj' => $self,
112	'Handlers' => {'Char' => \&ReadXMLFile::Char,
113	'XMLDecl' => \&ReadXMLFile::XMLDecl,
114	'Entity' => \&ReadXMLFile::Entity,
115	'Doctype' => \&ReadXMLFile::Doctype,
116	'Default' => \&ReadXMLFile::Default
117	});
118	$self->{'parser'} = $parser;
119
120
121	$self->{'rdf_desc'} = undef;
122	$self->{'about_key'} = undef;
123	$self->{'metadata_name'} = undef;
124	$self->{'metadata_value'} = undef;
125
126	$self->{'convert_to'} = "PagedImage";
127	my $secondary_plugin_options = $self->{'secondary_plugin_options'};
128
129	if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
130	$secondary_plugin_options->{'PagedImagePlugin'} = [];
131	}
132	my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
133	push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
134	push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
135	push(@$pagedimg_options, "-file_rename_method", "none");
136
137	$self = bless $self, $class;
138
139	# ***** no longer needed!
140	# # This needs to be done after blss, to $self passed to XML::Parser
141	# # can correctly resolve the right call-back methods during XML parsing
142
143
144	$self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
145	return $self;
146	}
147
148	sub get_default_process_exp {
149	my $self = shift (@_);
150
151	return q^(?i)\.rdf$^;
152	}
153
154	sub get_default_block_exp {
155	return q^(?i)\.(jpg\|jpeg\|gif)$^;
156	}
157
158
159
160	sub rdf_desc_to_id
161	{
162	my $self = shift (@_);
163	my ($rdf_desc) = @_;
164
165	my $rdf_id = {};
166
167	# initialise any .cpd (=complex multi page) structures
168
169	foreach my $about_key (keys %{$rdf_desc}) {
170	if ($about_key =~ m/\.cpd$/) {
171	my $about = $rdf_desc->{$about_key};
172	my $id = $about->{'dc:identifier'};
173
174	if ($id =~ m/^\s*$/) {
175	# missing id, make one up based on about attribute
176
177	my ($tailname, $dirname, $suffix)
178	= &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
179
180	$id = "about:$tailname";
181	}
182
183	$rdf_id->{$id} = $about;
184	$rdf_id->{$id}->{'ex:filename'} = $about_key;
185	$rdf_id->{$id}->{'ex:type'} = "complex";
186	$rdf_id->{$id}->{'pages'} = [];
187	}
188
189	}
190
191	# now add in non .cpd items
192
193	foreach my $about_key (keys %{$rdf_desc}) {
194	if ($about_key !~ m/\.cpd$/) {
195	my $about = $rdf_desc->{$about_key};
196	my $id = $about->{'dc:identifier'};
197
198
199	if ($id =~ m/^\s*$/) {
200	# missing id, make one up based on about attribute
201
202	my ($tailname, $dirname, $suffix)
203	= &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
204
205	$id = "about:$tailname";
206	}
207
208	if (defined $rdf_id->{$id}) {
209	$about->{'ex:filename'} = $about_key;
210
211	# dealing with complex multi-page situation
212	# Add to existing structure
213
214	my $pages = $rdf_id->{$id}->{'pages'};
215	push(@$pages,$about)
216	}
217	else {
218	# New entry
219
220	$rdf_id->{$id} = $about;
221	$rdf_id->{$id}->{'ex:type'} = "simple";
222	$rdf_id->{$id}->{'ex:filename'} = $about_key;
223	}
224	}
225
226	}
227
228	return $rdf_id;
229	}
230
231
232	sub metadata_table_txt_file
233	{
234	my $self = shift (@_);
235	my ($output_root,$page_num) = @_;
236
237	my $txt_filename = $output_root."_page_$page_num.txt";
238
239	my ($tailname, $dirname, $suffix)
240	= &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
241
242	my $txt_file = "$tailname$suffix";
243
244	return $txt_file;
245	}
246
247
248	sub output_metadata_table
249	{
250	my $self = shift (@_);
251	my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
252
253	my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
254
255	open(TOUT,">$txt_filename")
256	\|\| die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
257
258	print TOUT $page->{'MetadataTable'};
259	delete $page->{'MetadataTable'};
260
261	close (TOUT);
262	}
263
264
265	sub rdf_id_to_item_file
266	{
267	my $self = shift (@_);
268	my ($rdf_id,$tmp_dirname,$output_root) = @_;
269
270	my $item_file_list = [];
271
272	foreach my $id (keys %{$rdf_id}) {
273
274	my $id_safe = $id;
275	$id_safe =~ s/ /-/g;
276
277	my $output_filename = $output_root."_$id_safe.item";
278	open(FOUT,">$output_filename")
279	\|\| die "Unable to open $output_filename: $!\n";
280
281
282	print FOUT "<PagedDocument>\n";
283
284	my $rdf_doc = $rdf_id->{$id};
285	foreach my $metadata_name (keys %$rdf_doc) {
286
287
288	next if ($metadata_name eq "pages");
289
290	my $metadata_value = $rdf_doc->{$metadata_name};
291
292	# convert ns:name to ns.Name
293	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
294
295	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
296	}
297
298	if ($rdf_doc->{'ex:type'} eq "complex") {
299	my $pages = $rdf_doc->{'pages'};
300	my $page_num = 1;
301
302	foreach my $page (@$pages) {
303
304	my $imgfile = $page->{'ex:filename'};
305	if ($imgfile =~ m/(http\|ftp):/) {
306	$imgfile = "empty.jpg";
307	}
308	else {
309	$imgfile = &util::filename_cat("..","import",$imgfile);
310	}
311
312	my $txt_file
313	= $self->metadata_table_txt_file($output_root,$page_num);
314
315	$self->output_metadata_table($page,$page_num,
316	$tmp_dirname,$txt_file);
317
318
319	print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
320
321	foreach my $metadata_name (keys %$page) {
322
323	my $metadata_value = $rdf_doc->{$metadata_name};
324	# convert ns:name to ns.Name
325	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
326
327	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
328	}
329
330
331	$page_num++;
332
333
334	print FOUT " </Page>\n";
335	}
336	}
337	else {
338	# simple
339	# duplicate top-level metadata for now plus image to bind to
340
341	my $imgfile = $rdf_doc->{'ex:filename'};
342	if ($imgfile =~ m/(http\|ftp):/) {
343	$imgfile = "empty.jpg";
344	}
345	else {
346	$imgfile = &util::filename_cat("..","import",$imgfile);
347	$imgfile = "../import/".$imgfile;
348	}
349
350
351	my $txt_file = $self->metadata_table_txt_file($output_root,1);
352	$self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
353
354	print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
355	foreach my $metadata_name (keys %$rdf_doc) {
356
357	my $metadata_value = $rdf_doc->{$metadata_name};
358
359	# convert ns:name to ns.Name
360	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
361
362	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
363	}
364	print FOUT " </Page>\n";
365
366	}
367
368	print FOUT "</PagedDocument>\n";
369	close(FOUT);
370
371	push(@$item_file_list,$output_filename);
372
373	}
374
375
376	return $item_file_list;
377	}
378
379
380
381	sub xml_area_convert_file
382	{
383	my $self = shift (@_);
384	my ($input_filename, $tmp_dirname, $output_root) = @_;
385
386	eval {
387	# Build up hash table/tree of all records
388
389	my $xslt = $self->{'xslt'};
390	if (defined $xslt && ($xslt ne "")) {
391	# perform xslt
392	my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
393
394	open(TOUT,">/tmp/tout.xml")
395	\|\| die "Unable to open /tmp/tout.xml: $!\n";
396	print TOUT $transformed_xml;
397	close(TOUT);
398
399
400	# feed transformed file (now in memory as string) into XML parser
401	$self->{'parser'}->parse($transformed_xml);
402	}
403	else {
404	$self->{'parser'}->parsefile($input_filename);
405	}
406	};
407
408	if ($@) {
409
410	# parsefile may either croak somewhere in XML::Parser (e.g. because
411	# the document is not well formed) or die somewhere in XMLPlug or a
412	# derived plugin (e.g. because we're attempting to process a
413	# document whose DOCTYPE is not meant for this plugin). For the
414	# first case we'll print a warning and continue, for the second
415	# we'll just continue quietly
416
417	print STDERR "**** Error is: $@\n";
418
419	my $file = $self->{'file'};
420
421	my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
422	if (defined $msg) {
423	my $outhandle = $self->{'outhandle'};
424	my $plugin_name = ref ($self);
425	print $outhandle "$plugin_name failed to process $file ($msg)\n";
426	}
427
428	my $gli = $self->{'gli'};
429
430	# reset ourself for the next document
431	$self->{'section_level'}=0;
432	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
433	return ("fail",undef); # error during processing
434	}
435
436	my $rdf_desc = $self->{'rdf_desc'};
437
438	# foreach my $about_key (keys %{$rdf_desc}) {
439	# my $about = $rdf_desc->{$about_key};
440	# foreach my $metadata_name (keys %{$about}) {
441	#
442	# my $metadata_value = $about->{$metadata_name};
443	## print STDERR " $metadata_name: $metadata_value\n";
444	# }
445	# }
446
447
448	# Merge entries with same name
449
450
451	my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
452
453	# foreach my $about_key (keys %{$merged_rdf_id}) {
454	# my $about = $merged_rdf_id->{$about_key};
455	# foreach my $metadata_name (keys %{$about}) {
456	#
457	# my $metadata_value = $about->{$metadata_name};
458	## print STDERR " $metadata_name: $metadata_value\n";
459	# }
460	# }
461
462
463
464	my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
465	$output_root);
466
467	return ("item",$item_files);
468	}
469
470
471	# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
472	# to CONTENTdm
473	#
474	# A better (i.e. in the future) solution would be to see if this can be
475	# shifted into gsConvert.pl so there is no need to override the
476	# default tmp_area_convert_file()
477
478
479	sub tmp_area_convert_file {
480	my $self = shift (@_);
481	my ($output_ext, $input_filename, $textref) = @_;
482
483	# is textref ever used?!?
484
485	my $outhandle = $self->{'outhandle'};
486	my $convert_to = $self->{'convert_to'};
487	my $failhandle = $self->{'failhandle'};
488	my $convert_to_ext = $self->{'convert_to_ext'};
489
490	# softlink to collection tmp dir
491	my $tmp_dirname
492	= &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
493	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
494
495	# derive tmp filename from input filename
496	my ($tailname, $dirname, $suffix)
497	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
498
499	# Remove any white space from filename -- no risk of name collision, and
500	# makes later conversion by utils simpler. Leave spaces in path...
501	# tidy up the filename with space, dot, hyphen between
502	$tailname =~ s/\s+//g;
503	$tailname =~ s/\.+//g;
504	$tailname =~ s/\-+//g;
505
506	$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
507	$suffix = lc($suffix);
508	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
509
510	&util::soft_link($input_filename, $tmp_filename);
511	my $verbosity = $self->{'verbosity'};
512	if ($verbosity > 0) {
513	print $outhandle "Converting $tailname$suffix to $convert_to format\n";
514	}
515
516	my $errlog = &util::filename_cat($tmp_dirname, "err.log");
517
518	# call xml_area_convert_file rather than gsConvert.pl
519
520	my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
521
522	my ($output_type,$item_files)
523	= $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
524
525
526	my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
527	my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
528
529	print STDERR "***** No source image identified with item\n";
530
531	print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
532
533	&util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
534
535	# continue as before ...
536
537	# remove symbolic link to original file
538	&util::rm($tmp_filename);
539
540	# Check STDERR here
541	chomp $output_type;
542	if ($output_type eq "fail") {
543	print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
544	print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
545	$self->{'num_not_processed'} ++;
546	if (-s "$errlog") {
547	open(ERRLOG, "$errlog");
548	while (<ERRLOG>) {
549	print $outhandle "$_";
550	}
551	print $outhandle "\n";
552	close ERRLOG;
553	}
554	&util::rm("$errlog") if (-e "$errlog");
555	return [];
556	}
557
558	# store the actual output type and return the output filename
559	# it's possible we requested conversion to html, but only to text succeeded
560	#$self->{'convert_to_ext'} = $output_type;
561	if ($output_type =~ /html/i) {
562	$self->{'converted_to'} = "HTML";
563	} elsif ($output_type =~ /te?xt/i) {
564	$self->{'converted_to'} = "Text";
565	} elsif ($output_type =~ /item/i){
566	$self->{'converted_to'} = "PagedImage";
567	}
568
569
570	return $item_files;
571	}
572
573
574
575
576	# Override ConvertBinaryFile read
577	# Needed so multiple .item files generated are sent down secondary plugin
578
579	sub read {
580	my $self = shift (@_);
581	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
582
583
584	$self->{'gli'} = $gli;
585	$self->{'file'} = $file;
586
587	my $successful_rv = -1;
588
589	my $outhandle = $self->{'outhandle'};
590
591	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
592	return undef unless $self->can_process_this_file($filename_full_path);
593
594	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
595
596	# read() deviates at this point from ConvertBinaryFile
597	# Need to work with list of filename returned
598
599	my $output_ext = $self->{'convert_to_ext'};
600	my $conv_filename_list = [];
601
602	$conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
603
604	if (scalar(@$conv_filename_list)==0) {
605	return -1;
606	} # had an error, will be passed down pipeline
607
608	foreach my $conv_filename ( @$conv_filename_list ) {
609	if (! -e "$conv_filename") {return -1;}
610	$self->{'conv_filename'} = $conv_filename; # is this used anywhere?
611	$self->convert_post_process($conv_filename);
612
613	my $secondary_plugins = $self->{'secondary_plugins'};
614	my $num_secondary_plugins = scalar(keys %$secondary_plugins);
615
616	if ($num_secondary_plugins == 0) {
617	print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
618	return 0; # effectively block it
619	}
620
621	my @plugin_names = keys %$secondary_plugins;
622	my $plugin_name = shift @plugin_names;
623
624	if ($num_secondary_plugins > 1) {
625	print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
626	}
627
628	my $secondary_plugin = $secondary_plugins->{$plugin_name};
629
630	# note: metadata is not carried on to the next level
631	my ($rv,$doc_obj)
632	= $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
633	$block_hash, $metadata, $processor, $maxdocs, $total_count,
634	$gli);
635
636	print STDERR "**** $conv_filename => returned rv = $rv\n";
637
638	if ((defined $rv) && ($rv>=0)) {
639	$successful_rv = 1;
640	}
641
642	# Override previous gsdlsourcefilename set by secondary plugin
643	my $collect_file = &util::filename_within_collection($filename_full_path);
644	my $collect_conv_file = &util::filename_within_collection($conv_filename);
645	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
646	$doc_obj->set_converted_filename($collect_conv_file);
647
648	my ($filemeta) = $file =~ /([^\\\/]+)$/;
649	$self->set_Source_metadata($doc_obj, $filemeta);
650	$doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
651	$doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
652
653	if ($self->{'cover_image'}) {
654	$self->associate_cover_image($doc_obj, $filename_full_path);
655	}
656
657	# do plugin specific processing of doc_obj
658	unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
659	print STDERR "***** process returned undef: $base_dir $file\n";
660	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
661	return -1;
662	}
663	# do any automatic metadata extraction
664	$self->auto_extract_metadata ($doc_obj);
665
666	# have we found a Title??
667	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
668
669	# add an OID
670	$self->add_OID($doc_obj);
671	# process the document
672	$processor->process($doc_obj);
673
674	$self->{'num_processed'} ++;
675	}
676
677	return $successful_rv;
678	}
679
680	sub process {
681
682	return 1;
683	}
684
685	# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
686	sub process_old {
687	my $self = shift (@_);
688	my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
689
690
691	my $secondary_plugins = $self->{'secondary_plugins'};
692	my @plugin_names = keys %$secondary_plugins;
693	my $plugin_name = shift @plugin_names; # already checked there is only one
694
695	my $secondary_plugin = $secondary_plugins->{$plugin_name};
696
697	my $result = $secondary_plugin->process(@_);
698
699	return $result;
700	}
701
702
703	# Called at the beginning of the XML document.
704	sub xml_start_document {
705	my $self = shift(@_);
706	my ($expat) = @_;
707
708	$self->{'rdf_desc'} = {};
709	}
710
711
712	# Called for DOCTYPE declarations - use die to bail out if this doctype
713	# is not meant for this plugin
714	sub xml_doctype {
715	my $self = shift(@_);
716	my ($expat, $name, $sysid, $pubid, $internal) = @_;
717
718	die "" if ($name !~ /^rdf:RDF$/);
719
720	my $outhandle = $self->{'outhandle'};
721	print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
722
723	}
724
725	# Called for every start tag. The $_ variable will contain a copy of the
726	# tag and the %_ variable will contain the element's attributes.
727	sub xml_start_tag {
728	my $self = shift(@_);
729	my ($expat, $element) = @_;
730
731	if ($element eq "rdf:Description") {
732
733	my $about_key = $_{'about'};
734
735	my $rdf_desc = $self->{'rdf_desc'};
736	$rdf_desc->{$about_key} = {};
737
738	$self->{'about_key'} = $about_key;
739	$self->{'index_text'} = "";
740	$self->{'pp_text'} = "<table width=\"100%\">\n";
741
742
743	}
744	elsif (defined $self->{'about_key'}) {
745	$self->{'metadata_name'} = $element;
746	$self->{'metadata_value'} = "";
747	}
748
749	}
750
751	# Called for every end tag. The $_ variable will contain a copy of the tag.
752	sub xml_end_tag {
753	my $self = shift(@_);
754	my ($expat, $element) = @_;
755
756	if ($element eq "rdf:Description") {
757	$self->{'pp_text'} .= "</table>\n";
758	## ghtml::htmlsafe($self->{'pp_text'});
759
760
761	my $about_key = $self->{'about_key'};
762	my $about = $self->{'rdf_desc'}->{$about_key};
763	$about->{'IndexText'} = $self->{'index_text'};
764	$about->{'MetadataTable'} = $self->{'pp_text'};
765
766
767	$self->{'about_key'} = undef;
768	$self->{'index_text'} = undef;
769	$self->{'pp_text'} = undef;
770
771	}
772	elsif (defined $self->{'metadata_name'}) {
773	my $metadata_name = $self->{'metadata_name'};
774	if ($element eq $metadata_name) {
775	my $metadata_value = $self->{'metadata_value'};
776
777	my $about_key = $self->{'about_key'};
778	my $about = $self->{'rdf_desc'}->{$about_key};
779	$about->{$metadata_name} = $metadata_value;
780
781	$self->{'index_text'} .= "$metadata_value\n";
782	$self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
783
784	$self->{'metadata_name'} = undef;
785	$self->{'metadata_value'} = undef;
786	}
787	}
788	}
789
790	# Called just before start or end tags with accumulated non-markup text in
791	# the $_ variable.
792	sub xml_text {
793	my $self = shift(@_);
794	my ($expat) = @_;
795
796	if (defined $self->{'metadata_name'}) {
797	$self->{'metadata_value'} .= $_;
798	}
799	}
800
801	# Called at the end of the XML document.
802	sub xml_end_document {
803	my $self = shift(@_);
804	my ($expat) = @_;
805	}
806
807
808	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: