Context Navigation

source: gsdl/trunk/perllib/plugins/CONTENTdmPlugin.pm@ 20790

Last change on this file since 20790 was 20790, checked in by kjdon, 15 years ago
set -processing_tmp_files option to secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)
Property svn:keywords set to `Author Date Id Revision`
File size: 21.8 KB

Line
1	###########################################################################
2	#
3	# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package CONTENTdmPlugin;
26
27	use ConvertBinaryFile;
28	use ReadXMLFile;
29
30	use unicode;
31	use ghtml;
32
33	use strict;
34	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37	use XMLParser;
38
39	# inherit ReadXMLFile for the apply_xslt method
40	sub BEGIN {
41	@CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42	}
43
44
45	my $convert_to_list =
46	[ { 'name' => "auto",
47	'desc' => "{ConvertBinaryFile.convert_to.auto}" },
48	{ 'name' => "html",
49	'desc' => "{ConvertBinaryFile.convert_to.html}" },
50	{ 'name' => "text",
51	'desc' => "{ConvertBinaryFile.convert_to.text}" },
52	{ 'name' => "pagedimg",
53	'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
54	];
55
56
57
58	my $arguments =
59	[
60	{ 'name' => "convert_to",
61	'desc' => "{ConvertBinaryFile.convert_to}",
62	'type' => "enum",
63	'reqd' => "yes",
64	'list' => $convert_to_list,
65	'deft' => "html" },
66	{ 'name' => "xslt",
67	'desc' => "{ReadXMLFile.xslt}",
68	'type' => "string",
69	'deft' => "",
70	'reqd' => "no" },
71	{ 'name' => "process_exp",
72	'desc' => "{BasePlugin.process_exp}",
73	'type' => "regexp",
74	'deft' => &get_default_process_exp(),
75	'reqd' => "no" },
76	{ 'name' => "block_exp",
77	'desc' => "{BasePlugin.block_exp}",
78	'type' => "regexp",
79	'deft' => &get_default_block_exp() }
80	];
81
82	my $options = { 'name' => "CONTENTdmPlugin",
83	'desc' => "{CONTENTdmPlugin.desc}",
84	'abstract' => "no",
85	'inherits' => "yes",
86	# CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
87	'srcreplaceable' => "no",
88	'args' => $arguments };
89
90	sub new {
91	my ($class) = shift (@_);
92	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
93	push(@$pluginlist, $class);
94
95	push(@$inputargs,"-title_sub");
96	push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
97
98	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99	push(@{$hashArgOptLists->{"OptList"}},$options);
100
101	my @arg_array = @$inputargs;
102	my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
103
104	if ($self->{'info_only'}) {
105	# don't worry about any options etc
106	return bless $self, $class;
107	}
108
109	my $parser = new XML::Parser('Style' => 'Stream',
110	'Pkg' => 'ReadXMLFile',
111	'PluginObj' => $self,
112	'Handlers' => {'Char' => \&ReadXMLFile::Char,
113	'XMLDecl' => \&ReadXMLFile::XMLDecl,
114	'Entity' => \&ReadXMLFile::Entity,
115	'Doctype' => \&ReadXMLFile::Doctype,
116	'Default' => \&ReadXMLFile::Default
117	});
118	$self->{'parser'} = $parser;
119
120
121	$self->{'rdf_desc'} = undef;
122	$self->{'about_key'} = undef;
123	$self->{'metadata_name'} = undef;
124	$self->{'metadata_value'} = undef;
125
126	$self->{'convert_to'} = "PagedImage";
127	my $secondary_plugin_options = $self->{'secondary_plugin_options'};
128
129	if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
130	$secondary_plugin_options->{'PagedImagePlugin'} = [];
131	}
132	my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
133	push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
134	push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
135	push(@$pagedimg_options, "-file_rename_method", "none");
136	push(@$pagedimg_options, "-processing_tmp_files");
137	$self = bless $self, $class;
138
139	# ***** no longer needed!
140	# # This needs to be done after blss, to $self passed to XML::Parser
141	# # can correctly resolve the right call-back methods during XML parsing
142
143
144	$self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
145	return $self;
146	}
147
148	sub get_default_process_exp {
149	my $self = shift (@_);
150
151	return q^(?i)\.rdf$^;
152	}
153
154	sub get_default_block_exp {
155	return q^(?i)\.(jpg\|jpeg\|gif)$^;
156	}
157
158
159
160	sub rdf_desc_to_id
161	{
162	my $self = shift (@_);
163	my ($rdf_desc) = @_;
164
165	my $rdf_id = {};
166
167	# initialise any .cpd (=complex multi page) structures
168
169	foreach my $about_key (keys %{$rdf_desc}) {
170	if ($about_key =~ m/\.cpd$/) {
171	my $about = $rdf_desc->{$about_key};
172	my $id = $about->{'dc:identifier'};
173
174	if ($id =~ m/^\s*$/) {
175	# missing id, make one up based on about attribute
176
177	my ($tailname, $dirname, $suffix)
178	= &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
179
180	$id = "about:$tailname";
181	}
182
183	$rdf_id->{$id} = $about;
184	$rdf_id->{$id}->{'ex:filename'} = $about_key;
185	$rdf_id->{$id}->{'ex:type'} = "complex";
186	$rdf_id->{$id}->{'pages'} = [];
187	}
188
189	}
190
191	# now add in non .cpd items
192
193	foreach my $about_key (keys %{$rdf_desc}) {
194	if ($about_key !~ m/\.cpd$/) {
195	my $about = $rdf_desc->{$about_key};
196	my $id = $about->{'dc:identifier'};
197
198
199	if ($id =~ m/^\s*$/) {
200	# missing id, make one up based on about attribute
201
202	my ($tailname, $dirname, $suffix)
203	= &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
204
205	$id = "about:$tailname";
206	}
207
208	if (defined $rdf_id->{$id}) {
209	$about->{'ex:filename'} = $about_key;
210
211	# dealing with complex multi-page situation
212	# Add to existing structure
213
214	my $pages = $rdf_id->{$id}->{'pages'};
215	push(@$pages,$about)
216	}
217	else {
218	# New entry
219
220	$rdf_id->{$id} = $about;
221	$rdf_id->{$id}->{'ex:type'} = "simple";
222	$rdf_id->{$id}->{'ex:filename'} = $about_key;
223	}
224	}
225
226	}
227
228	return $rdf_id;
229	}
230
231
232	sub metadata_table_txt_file
233	{
234	my $self = shift (@_);
235	my ($output_root,$page_num) = @_;
236
237	my $txt_filename = $output_root."_page_$page_num.txt";
238
239	my ($tailname, $dirname, $suffix)
240	= &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
241
242	my $txt_file = "$tailname$suffix";
243
244	return $txt_file;
245	}
246
247
248	sub output_metadata_table
249	{
250	my $self = shift (@_);
251	my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
252
253	my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
254
255	open(TOUT,">$txt_filename")
256	\|\| die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
257
258	print TOUT $page->{'MetadataTable'};
259	delete $page->{'MetadataTable'};
260
261	close (TOUT);
262	}
263
264
265	sub rdf_id_to_item_file
266	{
267	my $self = shift (@_);
268	my ($rdf_id,$tmp_dirname,$output_root) = @_;
269
270	my $item_file_list = [];
271
272	foreach my $id (keys %{$rdf_id}) {
273
274	my $id_safe = $id;
275	$id_safe =~ s/ /-/g;
276
277	my $output_filename = $output_root."_$id_safe.item";
278	open(FOUT,">$output_filename")
279	\|\| die "Unable to open $output_filename: $!\n";
280
281
282	print FOUT "<PagedDocument>\n";
283
284	my $rdf_doc = $rdf_id->{$id};
285	foreach my $metadata_name (keys %$rdf_doc) {
286
287
288	next if ($metadata_name eq "pages");
289
290	my $metadata_value = $rdf_doc->{$metadata_name};
291
292	# convert ns:name to ns.Name
293	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
294
295	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
296	}
297
298	if ($rdf_doc->{'ex:type'} eq "complex") {
299	my $pages = $rdf_doc->{'pages'};
300	my $page_num = 1;
301
302	foreach my $page (@$pages) {
303
304	my $imgfile = $page->{'ex:filename'};
305	if ($imgfile =~ m/(http\|ftp):/) {
306	$imgfile = "empty.jpg";
307	}
308	else {
309	$imgfile = &util::filename_cat("..","import",$imgfile);
310	}
311
312	my $txt_file
313	= $self->metadata_table_txt_file($output_root,$page_num);
314
315	$self->output_metadata_table($page,$page_num,
316	$tmp_dirname,$txt_file);
317
318
319	print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
320
321	foreach my $metadata_name (keys %$page) {
322
323	my $metadata_value = $rdf_doc->{$metadata_name};
324	# convert ns:name to ns.Name
325	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
326
327	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
328	}
329
330
331	$page_num++;
332
333
334	print FOUT " </Page>\n";
335	}
336	}
337	else {
338	# simple
339	# duplicate top-level metadata for now plus image to bind to
340
341	my $imgfile = $rdf_doc->{'ex:filename'};
342	if ($imgfile =~ m/(http\|ftp):/) {
343	$imgfile = "empty.jpg";
344	}
345	else {
346	$imgfile = &util::filename_cat("..","import",$imgfile);
347	}
348
349
350	my $txt_file = $self->metadata_table_txt_file($output_root,1);
351	$self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
352
353	print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
354	foreach my $metadata_name (keys %$rdf_doc) {
355
356	my $metadata_value = $rdf_doc->{$metadata_name};
357
358	# convert ns:name to ns.Name
359	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
360
361	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
362	}
363	print FOUT " </Page>\n";
364
365	}
366
367	print FOUT "</PagedDocument>\n";
368	close(FOUT);
369
370	push(@$item_file_list,$output_filename);
371
372	}
373
374
375	return $item_file_list;
376	}
377
378
379
380	sub xml_area_convert_file
381	{
382	my $self = shift (@_);
383	my ($input_filename, $tmp_dirname, $output_root) = @_;
384
385	eval {
386	# Build up hash table/tree of all records
387
388	my $xslt = $self->{'xslt'};
389	if (defined $xslt && ($xslt ne "")) {
390	# perform xslt
391	my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
392
393	open(TOUT,">/tmp/tout.xml")
394	\|\| die "Unable to open /tmp/tout.xml: $!\n";
395	print TOUT $transformed_xml;
396	close(TOUT);
397
398
399	# feed transformed file (now in memory as string) into XML parser
400	$self->{'parser'}->parse($transformed_xml);
401	}
402	else {
403	$self->{'parser'}->parsefile($input_filename);
404	}
405	};
406
407	if ($@) {
408
409	# parsefile may either croak somewhere in XML::Parser (e.g. because
410	# the document is not well formed) or die somewhere in XMLPlug or a
411	# derived plugin (e.g. because we're attempting to process a
412	# document whose DOCTYPE is not meant for this plugin). For the
413	# first case we'll print a warning and continue, for the second
414	# we'll just continue quietly
415
416	print STDERR "**** Error is: $@\n";
417
418	my $file = $self->{'file'};
419
420	my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
421	if (defined $msg) {
422	my $outhandle = $self->{'outhandle'};
423	my $plugin_name = ref ($self);
424	print $outhandle "$plugin_name failed to process $file ($msg)\n";
425	}
426
427	my $gli = $self->{'gli'};
428
429	# reset ourself for the next document
430	$self->{'section_level'}=0;
431	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
432	return ("fail",undef); # error during processing
433	}
434
435	my $rdf_desc = $self->{'rdf_desc'};
436
437	# foreach my $about_key (keys %{$rdf_desc}) {
438	# my $about = $rdf_desc->{$about_key};
439	# foreach my $metadata_name (keys %{$about}) {
440	#
441	# my $metadata_value = $about->{$metadata_name};
442	## print STDERR " $metadata_name: $metadata_value\n";
443	# }
444	# }
445
446
447	# Merge entries with same name
448
449
450	my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
451
452	# foreach my $about_key (keys %{$merged_rdf_id}) {
453	# my $about = $merged_rdf_id->{$about_key};
454	# foreach my $metadata_name (keys %{$about}) {
455	#
456	# my $metadata_value = $about->{$metadata_name};
457	## print STDERR " $metadata_name: $metadata_value\n";
458	# }
459	# }
460
461
462
463	my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
464	$output_root);
465
466	return ("item",$item_files);
467	}
468
469
470	# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
471	# to CONTENTdm
472	#
473	# A better (i.e. in the future) solution would be to see if this can be
474	# shifted into gsConvert.pl so there is no need to override the
475	# default tmp_area_convert_file()
476
477
478	sub tmp_area_convert_file {
479	my $self = shift (@_);
480	my ($output_ext, $input_filename, $textref) = @_;
481
482	# is textref ever used?!?
483
484	my $outhandle = $self->{'outhandle'};
485	my $convert_to = $self->{'convert_to'};
486	my $failhandle = $self->{'failhandle'};
487	my $convert_to_ext = $self->{'convert_to_ext'};
488
489	# softlink to collection tmp dir
490	my $tmp_dirname
491	= &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
492	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
493
494	# derive tmp filename from input filename
495	my ($tailname, $dirname, $suffix)
496	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
497
498	# Remove any white space from filename -- no risk of name collision, and
499	# makes later conversion by utils simpler. Leave spaces in path...
500	# tidy up the filename with space, dot, hyphen between
501	$tailname =~ s/\s+//g;
502	$tailname =~ s/\.+//g;
503	$tailname =~ s/\-+//g;
504
505	$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
506	$suffix = lc($suffix);
507	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
508
509	&util::soft_link($input_filename, $tmp_filename);
510	my $verbosity = $self->{'verbosity'};
511	if ($verbosity > 0) {
512	print $outhandle "Converting $tailname$suffix to $convert_to format\n";
513	}
514
515	my $errlog = &util::filename_cat($tmp_dirname, "err.log");
516
517	# call xml_area_convert_file rather than gsConvert.pl
518
519	my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
520
521	my ($output_type,$item_files)
522	= $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
523
524
525	my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
526	my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
527
528	print STDERR "***** No source image identified with item\n";
529
530	print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
531
532	&util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
533
534	# continue as before ...
535
536	# remove symbolic link to original file
537	&util::rm($tmp_filename);
538
539	# Check STDERR here
540	chomp $output_type;
541	if ($output_type eq "fail") {
542	print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
543	print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
544	$self->{'num_not_processed'} ++;
545	if (-s "$errlog") {
546	open(ERRLOG, "$errlog");
547	while (<ERRLOG>) {
548	print $outhandle "$_";
549	}
550	print $outhandle "\n";
551	close ERRLOG;
552	}
553	&util::rm("$errlog") if (-e "$errlog");
554	return [];
555	}
556
557	# store the actual output type and return the output filename
558	# it's possible we requested conversion to html, but only to text succeeded
559	#$self->{'convert_to_ext'} = $output_type;
560	if ($output_type =~ /html/i) {
561	$self->{'converted_to'} = "HTML";
562	} elsif ($output_type =~ /te?xt/i) {
563	$self->{'converted_to'} = "Text";
564	} elsif ($output_type =~ /item/i){
565	$self->{'converted_to'} = "PagedImage";
566	}
567
568
569	return $item_files;
570	}
571
572
573
574
575	# Override ConvertBinaryFile read
576	# Needed so multiple .item files generated are sent down secondary plugin
577
578	sub read {
579	my $self = shift (@_);
580	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
581
582
583	$self->{'gli'} = $gli;
584	$self->{'file'} = $file;
585
586	my $successful_rv = -1;
587
588	my $outhandle = $self->{'outhandle'};
589
590	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
591	return undef unless $self->can_process_this_file($filename_full_path);
592
593	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
594
595	# read() deviates at this point from ConvertBinaryFile
596	# Need to work with list of filename returned
597
598	my $output_ext = $self->{'convert_to_ext'};
599	my $conv_filename_list = [];
600
601	$conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
602
603	if (scalar(@$conv_filename_list)==0) {
604	return -1;
605	} # had an error, will be passed down pipeline
606
607	foreach my $conv_filename ( @$conv_filename_list ) {
608	if (! -e "$conv_filename") {return -1;}
609	$self->{'conv_filename'} = $conv_filename; # is this used anywhere?
610	$self->convert_post_process($conv_filename);
611
612	my $secondary_plugins = $self->{'secondary_plugins'};
613	my $num_secondary_plugins = scalar(keys %$secondary_plugins);
614
615	if ($num_secondary_plugins == 0) {
616	print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
617	return 0; # effectively block it
618	}
619
620	my @plugin_names = keys %$secondary_plugins;
621	my $plugin_name = shift @plugin_names;
622
623	if ($num_secondary_plugins > 1) {
624	print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
625	}
626
627	my $secondary_plugin = $secondary_plugins->{$plugin_name};
628
629	# note: metadata is not carried on to the next level
630	my ($rv,$doc_obj)
631	= $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
632	$block_hash, $metadata, $processor, $maxdocs, $total_count,
633	$gli);
634
635	print STDERR "**** $conv_filename => returned rv = $rv\n";
636
637	if ((defined $rv) && ($rv>=0)) {
638	$successful_rv = 1;
639	}
640
641	# Override previous gsdlsourcefilename set by secondary plugin
642	my $collect_file = &util::filename_within_collection($filename_full_path);
643	my $collect_conv_file = &util::filename_within_collection($conv_filename);
644	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
645	$doc_obj->set_converted_filename($collect_conv_file);
646
647	my ($filemeta) = $file =~ /([^\\\/]+)$/;
648	$self->set_Source_metadata($doc_obj, $filemeta);
649	$doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
650	$doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
651
652	if ($self->{'cover_image'}) {
653	$self->associate_cover_image($doc_obj, $filename_full_path);
654	}
655
656	# do plugin specific processing of doc_obj
657	unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
658	print STDERR "***** process returned undef: $base_dir $file\n";
659	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
660	return -1;
661	}
662	# do any automatic metadata extraction
663	$self->auto_extract_metadata ($doc_obj);
664
665	# have we found a Title??
666	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
667
668	# add an OID
669	$self->add_OID($doc_obj);
670	# process the document
671	$processor->process($doc_obj);
672
673	$self->{'num_processed'} ++;
674	}
675
676	return $successful_rv;
677	}
678
679	sub process {
680
681	return 1;
682	}
683
684	# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
685	sub process_old {
686	my $self = shift (@_);
687	my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
688
689
690	my $secondary_plugins = $self->{'secondary_plugins'};
691	my @plugin_names = keys %$secondary_plugins;
692	my $plugin_name = shift @plugin_names; # already checked there is only one
693
694	my $secondary_plugin = $secondary_plugins->{$plugin_name};
695
696	my $result = $secondary_plugin->process(@_);
697
698	return $result;
699	}
700
701
702	# Called at the beginning of the XML document.
703	sub xml_start_document {
704	my $self = shift(@_);
705	my ($expat) = @_;
706
707	$self->{'rdf_desc'} = {};
708	}
709
710
711	# Called for DOCTYPE declarations - use die to bail out if this doctype
712	# is not meant for this plugin
713	sub xml_doctype {
714	my $self = shift(@_);
715	my ($expat, $name, $sysid, $pubid, $internal) = @_;
716
717	die "" if ($name !~ /^rdf:RDF$/);
718
719	my $outhandle = $self->{'outhandle'};
720	print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
721
722	}
723
724	# Called for every start tag. The $_ variable will contain a copy of the
725	# tag and the %_ variable will contain the element's attributes.
726	sub xml_start_tag {
727	my $self = shift(@_);
728	my ($expat, $element) = @_;
729
730	if ($element eq "rdf:Description") {
731
732	my $about_key = $_{'about'};
733
734	my $rdf_desc = $self->{'rdf_desc'};
735	$rdf_desc->{$about_key} = {};
736
737	$self->{'about_key'} = $about_key;
738	$self->{'index_text'} = "";
739	$self->{'pp_text'} = "<table width=\"100%\">\n";
740
741
742	}
743	elsif (defined $self->{'about_key'}) {
744	$self->{'metadata_name'} = $element;
745	$self->{'metadata_value'} = "";
746	}
747
748	}
749
750	# Called for every end tag. The $_ variable will contain a copy of the tag.
751	sub xml_end_tag {
752	my $self = shift(@_);
753	my ($expat, $element) = @_;
754
755	if ($element eq "rdf:Description") {
756	$self->{'pp_text'} .= "</table>\n";
757	## ghtml::htmlsafe($self->{'pp_text'});
758
759
760	my $about_key = $self->{'about_key'};
761	my $about = $self->{'rdf_desc'}->{$about_key};
762	$about->{'IndexText'} = $self->{'index_text'};
763	$about->{'MetadataTable'} = $self->{'pp_text'};
764
765
766	$self->{'about_key'} = undef;
767	$self->{'index_text'} = undef;
768	$self->{'pp_text'} = undef;
769
770	}
771	elsif (defined $self->{'metadata_name'}) {
772	my $metadata_name = $self->{'metadata_name'};
773	if ($element eq $metadata_name) {
774	my $metadata_value = $self->{'metadata_value'};
775
776	my $about_key = $self->{'about_key'};
777	my $about = $self->{'rdf_desc'}->{$about_key};
778	$about->{$metadata_name} = $metadata_value;
779
780	$self->{'index_text'} .= "$metadata_value\n";
781	$self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
782
783	$self->{'metadata_name'} = undef;
784	$self->{'metadata_value'} = undef;
785	}
786	}
787	}
788
789	# Called just before start or end tags with accumulated non-markup text in
790	# the $_ variable.
791	sub xml_text {
792	my $self = shift(@_);
793	my ($expat) = @_;
794
795	if (defined $self->{'metadata_name'}) {
796	$self->{'metadata_value'} .= $_;
797	}
798	}
799
800	# Called at the end of the XML document.
801	sub xml_end_document {
802	my $self = shift(@_);
803	my ($expat) = @_;
804	}
805
806
807	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: