Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm@ 22597

Last change on this file since 22597 was 22597, checked in by kjdon, 14 years ago
code tidy up. rearranged how convertbinaryfile plugins set up their secondary plugins - now only set up the options for the one they are using. all subclass specific code moved out of convertbinaryfile.new into the appropriate plugin file.
Property svn:keywords set to `Author Date Id Revision`
File size: 22.4 KB

Line
1	###########################################################################
2	#
3	# CONTENTdmPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package CONTENTdmPlugin;
26
27	use ConvertBinaryFile;
28	use ReadXMLFile;
29
30	use unicode;
31	use ghtml;
32
33	use strict;
34	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
35
36
37	use XMLParser;
38
39	# inherit ReadXMLFile for the apply_xslt method
40	sub BEGIN {
41	@CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile');
42	}
43
44
45	my $convert_to_list =
46	[
47	# { 'name' => "auto",
48	# 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
49	# { 'name' => "html",
50	# 'desc' => "{ConvertBinaryFile.convert_to.html}" },
51	# { 'name' => "text",
52	# 'desc' => "{ConvertBinaryFile.convert_to.text}" },
53	{ 'name' => "pagedimg",
54	'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
55	];
56
57
58
59	my $arguments =
60	[
61	{ 'name' => "convert_to",
62	'desc' => "{ConvertBinaryFile.convert_to}",
63	'type' => "enum",
64	'reqd' => "yes",
65	'list' => $convert_to_list,
66	'deft' => "html" },
67	{ 'name' => "xslt",
68	'desc' => "{ReadXMLFile.xslt}",
69	'type' => "string",
70	'deft' => "",
71	'reqd' => "no" },
72	{ 'name' => "process_exp",
73	'desc' => "{BasePlugin.process_exp}",
74	'type' => "regexp",
75	'deft' => &get_default_process_exp(),
76	'reqd' => "no" },
77	{ 'name' => "block_exp",
78	'desc' => "{BasePlugin.block_exp}",
79	'type' => "regexp",
80	'deft' => &get_default_block_exp() }
81	];
82
83	my $options = { 'name' => "CONTENTdmPlugin",
84	'desc' => "{CONTENTdmPlugin.desc}",
85	'abstract' => "no",
86	'inherits' => "yes",
87	# CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html
88	'srcreplaceable' => "no",
89	'args' => $arguments };
90
91	sub new {
92	my ($class) = shift (@_);
93	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
94	push(@$pluginlist, $class);
95
96	push(@$inputargs,"-title_sub");
97	push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
98
99	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
100	push(@{$hashArgOptLists->{"OptList"}},$options);
101
102	my @arg_array = @$inputargs;
103	my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists);
104
105	if ($self->{'info_only'}) {
106	# don't worry about any options etc
107	return bless $self, $class;
108	}
109
110	my $parser = new XML::Parser('Style' => 'Stream',
111	'Pkg' => 'ReadXMLFile',
112	'PluginObj' => $self,
113	'Handlers' => {'Char' => \&ReadXMLFile::Char,
114	'XMLDecl' => \&ReadXMLFile::XMLDecl,
115	'Entity' => \&ReadXMLFile::Entity,
116	'Doctype' => \&ReadXMLFile::Doctype,
117	'Default' => \&ReadXMLFile::Default
118	});
119	$self->{'parser'} = $parser;
120
121
122	$self->{'rdf_desc'} = undef;
123	$self->{'about_key'} = undef;
124	$self->{'metadata_name'} = undef;
125	$self->{'metadata_value'} = undef;
126
127	# do we only allow one option??
128	$self->{'convert_to'} = "pagedimg";
129	$self->{'convert_to_plugin'} = "PagedImagePlugin";
130	$self->{'convert_to_ext'} = "jpg";
131
132	my $secondary_plugin_name = $self->{'convert_to_plugin'};
133	my $secondary_plugin_options = $self->{'secondary_plugin_options'};
134
135	if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
136	$secondary_plugin_options->{$secondary_plugin_name} = [];
137	}
138	my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
139
140	push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
141	push(@$specific_options, "-create_thumbnail", "true", "-create_screenview", "true");
142	push(@$specific_options, "-file_rename_method", "none");
143	push(@$specific_options, "-processing_tmp_files");
144
145	# my $secondary_plugin_options = $self->{'secondary_plugin_options'};
146
147	# if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
148	# $secondary_plugin_options->{'PagedImagePlugin'} = [];
149	# }
150	# my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
151	# push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
152	# push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
153	# push(@$pagedimg_options, "-file_rename_method", "none");
154	# push(@$pagedimg_options, "-processing_tmp_files");
155	$self = bless $self, $class;
156	$self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
157	return $self;
158	}
159
160	sub get_default_process_exp {
161	my $self = shift (@_);
162
163	return q^(?i)\.rdf$^;
164	}
165
166	sub get_default_block_exp {
167	return q^(?i)\.(jpg\|jpeg\|gif)$^;
168	}
169
170
171
172	sub rdf_desc_to_id
173	{
174	my $self = shift (@_);
175	my ($rdf_desc) = @_;
176
177	my $rdf_id = {};
178
179	# initialise any .cpd (=complex multi page) structures
180
181	foreach my $about_key (keys %{$rdf_desc}) {
182	if ($about_key =~ m/\.cpd$/) {
183	my $about = $rdf_desc->{$about_key};
184	my $id = $about->{'dc:identifier'};
185
186	if ($id =~ m/^\s*$/) {
187	# missing id, make one up based on about attribute
188
189	my ($tailname, $dirname, $suffix)
190	= &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
191
192	$id = "about:$tailname";
193	}
194
195	$rdf_id->{$id} = $about;
196	$rdf_id->{$id}->{'ex:filename'} = $about_key;
197	$rdf_id->{$id}->{'ex:type'} = "complex";
198	$rdf_id->{$id}->{'pages'} = [];
199	}
200
201	}
202
203	# now add in non .cpd items
204
205	foreach my $about_key (keys %{$rdf_desc}) {
206	if ($about_key !~ m/\.cpd$/) {
207	my $about = $rdf_desc->{$about_key};
208	my $id = $about->{'dc:identifier'};
209
210
211	if ($id =~ m/^\s*$/) {
212	# missing id, make one up based on about attribute
213
214	my ($tailname, $dirname, $suffix)
215	= &File::Basename::fileparse($about_key, "\\.[^\\.]+\$");
216
217	$id = "about:$tailname";
218	}
219
220	if (defined $rdf_id->{$id}) {
221	$about->{'ex:filename'} = $about_key;
222
223	# dealing with complex multi-page situation
224	# Add to existing structure
225
226	my $pages = $rdf_id->{$id}->{'pages'};
227	push(@$pages,$about)
228	}
229	else {
230	# New entry
231
232	$rdf_id->{$id} = $about;
233	$rdf_id->{$id}->{'ex:type'} = "simple";
234	$rdf_id->{$id}->{'ex:filename'} = $about_key;
235	}
236	}
237
238	}
239
240	return $rdf_id;
241	}
242
243
244	sub metadata_table_txt_file
245	{
246	my $self = shift (@_);
247	my ($output_root,$page_num) = @_;
248
249	my $txt_filename = $output_root."_page_$page_num.txt";
250
251	my ($tailname, $dirname, $suffix)
252	= &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$");
253
254	my $txt_file = "$tailname$suffix";
255
256	return $txt_file;
257	}
258
259
260	sub output_metadata_table
261	{
262	my $self = shift (@_);
263	my ($page,$page_num,$tmp_dirname,$txt_file) = @_;
264
265	my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file);
266
267	open(TOUT,">$txt_filename")
268	\|\| die "Error: unable to write metadata data out as txt file $txt_filename: $!\n";
269
270	print TOUT $page->{'MetadataTable'};
271	delete $page->{'MetadataTable'};
272
273	close (TOUT);
274	}
275
276
277	sub rdf_id_to_item_file
278	{
279	my $self = shift (@_);
280	my ($rdf_id,$tmp_dirname,$output_root) = @_;
281
282	my $item_file_list = [];
283
284	foreach my $id (keys %{$rdf_id}) {
285
286	my $id_safe = $id;
287	$id_safe =~ s/ /-/g;
288
289	my $output_filename = $output_root."_$id_safe.item";
290	open(FOUT,">$output_filename")
291	\|\| die "Unable to open $output_filename: $!\n";
292
293
294	print FOUT "<PagedDocument>\n";
295
296	my $rdf_doc = $rdf_id->{$id};
297	foreach my $metadata_name (keys %$rdf_doc) {
298
299
300	next if ($metadata_name eq "pages");
301
302	my $metadata_value = $rdf_doc->{$metadata_name};
303
304	# convert ns:name to ns.Name
305	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
306
307	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
308	}
309
310	if ($rdf_doc->{'ex:type'} eq "complex") {
311	my $pages = $rdf_doc->{'pages'};
312	my $page_num = 1;
313
314	foreach my $page (@$pages) {
315
316	my $imgfile = $page->{'ex:filename'};
317	if ($imgfile =~ m/(http\|ftp):/) {
318	$imgfile = "empty.jpg";
319	}
320	else {
321	$imgfile = &util::filename_cat("..","import",$imgfile);
322	}
323
324	my $txt_file
325	= $self->metadata_table_txt_file($output_root,$page_num);
326
327	$self->output_metadata_table($page,$page_num,
328	$tmp_dirname,$txt_file);
329
330
331	print FOUT " <Page pagenum=\"$page_num\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
332
333	foreach my $metadata_name (keys %$page) {
334
335	my $metadata_value = $rdf_doc->{$metadata_name};
336	# convert ns:name to ns.Name
337	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
338
339	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
340	}
341
342
343	$page_num++;
344
345
346	print FOUT " </Page>\n";
347	}
348	}
349	else {
350	# simple
351	# duplicate top-level metadata for now plus image to bind to
352
353	my $imgfile = $rdf_doc->{'ex:filename'};
354	if ($imgfile =~ m/(http\|ftp):/) {
355	$imgfile = "empty.jpg";
356	}
357	else {
358	$imgfile = &util::filename_cat("..","import",$imgfile);
359	}
360
361
362	my $txt_file = $self->metadata_table_txt_file($output_root,1);
363	$self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file);
364
365	print FOUT " <Page pagenum=\"1\" imgfile=\"$imgfile\" txtfile=\"$txt_file\">\n";
366	foreach my $metadata_name (keys %$rdf_doc) {
367
368	my $metadata_value = $rdf_doc->{$metadata_name};
369
370	# convert ns:name to ns.Name
371	$metadata_name =~ s/^(.?):(.)/$1\.\u$2/;
372
373	print FOUT " <Metadata name=\"$metadata_name\">$metadata_value</Metadata>\n";
374	}
375	print FOUT " </Page>\n";
376
377	}
378
379	print FOUT "</PagedDocument>\n";
380	close(FOUT);
381
382	push(@$item_file_list,$output_filename);
383
384	}
385
386
387	return $item_file_list;
388	}
389
390
391
392	sub xml_area_convert_file
393	{
394	my $self = shift (@_);
395	my ($input_filename, $tmp_dirname, $output_root) = @_;
396
397	eval {
398	# Build up hash table/tree of all records
399
400	my $xslt = $self->{'xslt'};
401	if (defined $xslt && ($xslt ne "")) {
402	# perform xslt
403	my $transformed_xml = $self->apply_xslt($xslt,$input_filename);
404
405	open(TOUT,">/tmp/tout.xml")
406	\|\| die "Unable to open /tmp/tout.xml: $!\n";
407	print TOUT $transformed_xml;
408	close(TOUT);
409
410
411	# feed transformed file (now in memory as string) into XML parser
412	$self->{'parser'}->parse($transformed_xml);
413	}
414	else {
415	$self->{'parser'}->parsefile($input_filename);
416	}
417	};
418
419	if ($@) {
420
421	# parsefile may either croak somewhere in XML::Parser (e.g. because
422	# the document is not well formed) or die somewhere in XMLPlug or a
423	# derived plugin (e.g. because we're attempting to process a
424	# document whose DOCTYPE is not meant for this plugin). For the
425	# first case we'll print a warning and continue, for the second
426	# we'll just continue quietly
427
428	print STDERR "**** Error is: $@\n";
429
430	my $file = $self->{'file'};
431
432	my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
433	if (defined $msg) {
434	my $outhandle = $self->{'outhandle'};
435	my $plugin_name = ref ($self);
436	print $outhandle "$plugin_name failed to process $file ($msg)\n";
437	}
438
439	my $gli = $self->{'gli'};
440
441	# reset ourself for the next document
442	$self->{'section_level'}=0;
443	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
444	return ("fail",undef); # error during processing
445	}
446
447	my $rdf_desc = $self->{'rdf_desc'};
448
449	# foreach my $about_key (keys %{$rdf_desc}) {
450	# my $about = $rdf_desc->{$about_key};
451	# foreach my $metadata_name (keys %{$about}) {
452	#
453	# my $metadata_value = $about->{$metadata_name};
454	## print STDERR " $metadata_name: $metadata_value\n";
455	# }
456	# }
457
458
459	# Merge entries with same name
460
461
462	my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc);
463
464	# foreach my $about_key (keys %{$merged_rdf_id}) {
465	# my $about = $merged_rdf_id->{$about_key};
466	# foreach my $metadata_name (keys %{$about}) {
467	#
468	# my $metadata_value = $about->{$metadata_name};
469	## print STDERR " $metadata_name: $metadata_value\n";
470	# }
471	# }
472
473
474
475	my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname,
476	$output_root);
477
478	return ("item",$item_files);
479	}
480
481
482	# Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific
483	# to CONTENTdm
484	#
485	# A better (i.e. in the future) solution would be to see if this can be
486	# shifted into gsConvert.pl so there is no need to override the
487	# default tmp_area_convert_file()
488
489
490	sub tmp_area_convert_file {
491	my $self = shift (@_);
492	my ($output_ext, $input_filename, $textref) = @_;
493
494	# is textref ever used?!?
495
496	my $outhandle = $self->{'outhandle'};
497	my $convert_to = $self->{'convert_to'};
498	my $failhandle = $self->{'failhandle'};
499	my $convert_to_ext = $self->{'convert_to_ext'};
500
501	# softlink to collection tmp dir
502	my $tmp_dirname
503	= &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
504	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
505
506	# derive tmp filename from input filename
507	my ($tailname, $dirname, $suffix)
508	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
509
510	# Remove any white space from filename -- no risk of name collision, and
511	# makes later conversion by utils simpler. Leave spaces in path...
512	# tidy up the filename with space, dot, hyphen between
513	$tailname =~ s/\s+//g;
514	$tailname =~ s/\.+//g;
515	$tailname =~ s/\-+//g;
516
517	$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
518	$suffix = lc($suffix);
519	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
520
521	&util::soft_link($input_filename, $tmp_filename);
522	my $verbosity = $self->{'verbosity'};
523	if ($verbosity > 0) {
524	print $outhandle "Converting $tailname$suffix to $convert_to format\n";
525	}
526
527	my $errlog = &util::filename_cat($tmp_dirname, "err.log");
528
529	# call xml_area_convert_file rather than gsConvert.pl
530
531	my $output_root = &util::filename_cat($tmp_dirname, "$tailname");
532
533	my ($output_type,$item_files)
534	= $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root);
535
536
537	my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg");
538	my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg");
539
540	print STDERR "***** No source image identified with item\n";
541
542	print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n";
543
544	&util::soft_link($fakeimg_filename, $fakeimg_tmp_filename);
545
546	# continue as before ...
547
548	# remove symbolic link to original file
549	&util::rm($tmp_filename);
550
551	# Check STDERR here
552	chomp $output_type;
553	if ($output_type eq "fail") {
554	print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
555	print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
556	$self->{'num_not_processed'} ++;
557	if (-s "$errlog") {
558	open(ERRLOG, "$errlog");
559	while (<ERRLOG>) {
560	print $outhandle "$_";
561	}
562	print $outhandle "\n";
563	close ERRLOG;
564	}
565	&util::rm("$errlog") if (-e "$errlog");
566	return [];
567	}
568
569	# store the actual output type and return the output filename
570	# it's possible we requested conversion to html, but only to text succeeded
571	#$self->{'convert_to_ext'} = $output_type;
572	if ($output_type =~ /html/i) {
573	$self->{'converted_to'} = "HTML";
574	} elsif ($output_type =~ /te?xt/i) {
575	$self->{'converted_to'} = "Text";
576	} elsif ($output_type =~ /item/i){
577	$self->{'converted_to'} = "PagedImage";
578	}
579
580
581	return $item_files;
582	}
583
584
585
586
587	# Override ConvertBinaryFile read
588	# Needed so multiple .item files generated are sent down secondary plugin
589
590	sub read {
591	my $self = shift (@_);
592	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
593
594
595	$self->{'gli'} = $gli;
596	$self->{'file'} = $file;
597
598	my $successful_rv = -1;
599
600	my $outhandle = $self->{'outhandle'};
601
602	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
603	return undef unless $self->can_process_this_file($filename_full_path);
604
605	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
606
607	# read() deviates at this point from ConvertBinaryFile
608	# Need to work with list of filename returned
609
610	my $output_ext = $self->{'convert_to_ext'};
611	my $conv_filename_list = [];
612
613	$conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path);
614
615	if (scalar(@$conv_filename_list)==0) {
616	return -1;
617	} # had an error, will be passed down pipeline
618
619	foreach my $conv_filename ( @$conv_filename_list ) {
620	if (! -e "$conv_filename") {return -1;}
621	$self->{'conv_filename'} = $conv_filename; # is this used anywhere?
622	$self->convert_post_process($conv_filename);
623
624	my $secondary_plugins = $self->{'secondary_plugins'};
625	my $num_secondary_plugins = scalar(keys %$secondary_plugins);
626
627	if ($num_secondary_plugins == 0) {
628	print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
629	return 0; # effectively block it
630	}
631
632	my @plugin_names = keys %$secondary_plugins;
633	my $plugin_name = shift @plugin_names;
634
635	if ($num_secondary_plugins > 1) {
636	print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
637	}
638
639	my $secondary_plugin = $secondary_plugins->{$plugin_name};
640
641	# note: metadata is not carried on to the next level
642	my ($rv,$doc_obj)
643	= $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
644	$block_hash, $metadata, $processor, $maxdocs, $total_count,
645	$gli);
646
647	print STDERR "**** $conv_filename => returned rv = $rv\n";
648
649	if ((defined $rv) && ($rv>=0)) {
650	$successful_rv = 1;
651	}
652
653	# Override previous gsdlsourcefilename set by secondary plugin
654	my $collect_file = &util::filename_within_collection($filename_full_path);
655	my $collect_conv_file = &util::filename_within_collection($conv_filename);
656	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
657	$doc_obj->set_converted_filename($collect_conv_file);
658
659	my ($filemeta) = $file =~ /([^\\\/]+)$/;
660	$self->set_Source_metadata($doc_obj, $filemeta);
661	$doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
662	$doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
663
664	if ($self->{'cover_image'}) {
665	$self->associate_cover_image($doc_obj, $filename_full_path);
666	}
667
668	# do plugin specific processing of doc_obj
669	unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
670	print STDERR "***** process returned undef: $base_dir $file\n";
671	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
672	return -1;
673	}
674	# do any automatic metadata extraction
675	$self->auto_extract_metadata ($doc_obj);
676
677	# have we found a Title??
678	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
679
680	# add an OID
681	$self->add_OID($doc_obj);
682	# process the document
683	$processor->process($doc_obj);
684
685	$self->{'num_processed'} ++;
686	}
687
688	return $successful_rv;
689	}
690
691	sub process {
692
693	return 1;
694	}
695
696	# do we need this? sec pluginn process would have already been called as part of read_into_doc_obj??
697	sub process_old {
698	my $self = shift (@_);
699	my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
700
701
702	my $secondary_plugins = $self->{'secondary_plugins'};
703	my @plugin_names = keys %$secondary_plugins;
704	my $plugin_name = shift @plugin_names; # already checked there is only one
705
706	my $secondary_plugin = $secondary_plugins->{$plugin_name};
707
708	my $result = $secondary_plugin->process(@_);
709
710	return $result;
711	}
712
713
714	# Called at the beginning of the XML document.
715	sub xml_start_document {
716	my $self = shift(@_);
717	my ($expat) = @_;
718
719	$self->{'rdf_desc'} = {};
720	}
721
722
723	# Called for DOCTYPE declarations - use die to bail out if this doctype
724	# is not meant for this plugin
725	sub xml_doctype {
726	my $self = shift(@_);
727	my ($expat, $name, $sysid, $pubid, $internal) = @_;
728
729	die "" if ($name !~ /^rdf:RDF$/);
730
731	my $outhandle = $self->{'outhandle'};
732	print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
733
734	}
735
736	# Called for every start tag. The $_ variable will contain a copy of the
737	# tag and the %_ variable will contain the element's attributes.
738	sub xml_start_tag {
739	my $self = shift(@_);
740	my ($expat, $element) = @_;
741
742	if ($element eq "rdf:Description") {
743
744	my $about_key = $_{'about'};
745
746	my $rdf_desc = $self->{'rdf_desc'};
747	$rdf_desc->{$about_key} = {};
748
749	$self->{'about_key'} = $about_key;
750	$self->{'index_text'} = "";
751	$self->{'pp_text'} = "<table width=\"100%\">\n";
752
753
754	}
755	elsif (defined $self->{'about_key'}) {
756	$self->{'metadata_name'} = $element;
757	$self->{'metadata_value'} = "";
758	}
759
760	}
761
762	# Called for every end tag. The $_ variable will contain a copy of the tag.
763	sub xml_end_tag {
764	my $self = shift(@_);
765	my ($expat, $element) = @_;
766
767	if ($element eq "rdf:Description") {
768	$self->{'pp_text'} .= "</table>\n";
769	## ghtml::htmlsafe($self->{'pp_text'});
770
771
772	my $about_key = $self->{'about_key'};
773	my $about = $self->{'rdf_desc'}->{$about_key};
774	$about->{'IndexText'} = $self->{'index_text'};
775	$about->{'MetadataTable'} = $self->{'pp_text'};
776
777
778	$self->{'about_key'} = undef;
779	$self->{'index_text'} = undef;
780	$self->{'pp_text'} = undef;
781
782	}
783	elsif (defined $self->{'metadata_name'}) {
784	my $metadata_name = $self->{'metadata_name'};
785	if ($element eq $metadata_name) {
786	my $metadata_value = $self->{'metadata_value'};
787
788	my $about_key = $self->{'about_key'};
789	my $about = $self->{'rdf_desc'}->{$about_key};
790	$about->{$metadata_name} = $metadata_value;
791
792	$self->{'index_text'} .= "$metadata_value\n";
793	$self->{'pp_text'} .= " <tr><td>$metadata_name</td><td>$metadata_value</td></tr>\n";
794
795	$self->{'metadata_name'} = undef;
796	$self->{'metadata_value'} = undef;
797	}
798	}
799	}
800
801	# Called just before start or end tags with accumulated non-markup text in
802	# the $_ variable.
803	sub xml_text {
804	my $self = shift(@_);
805	my ($expat) = @_;
806
807	if (defined $self->{'metadata_name'}) {
808	$self->{'metadata_value'} .= $_;
809	}
810	}
811
812	# Called at the end of the XML document.
813	sub xml_end_document {
814	my $self = shift(@_);
815	my ($expat) = @_;
816	}
817
818
819	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: