Context Navigation

source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 29013

Last change on this file since 29013 was 29013, checked in by ak19, 10 years ago
Following Dr Bainbridge's suggestion on determining a document's title which forms the anchor text for its RSS link.
Property svn:keywords set to `Author Date Id Revision`
File size: 37.1 KB

Rev	Line
[12330]	1	###########################################################################
	2	#
[17202]	3	# BasePlugout.pm -- base class for all the plugout modules
[12330]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 2006 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[17202]	26	package BasePlugout;
[12330]	27
	28	eval {require bytes};
	29
	30	use strict;
	31	no strict 'subs';
[12459]	32	no strict 'refs';
[12330]	33
[21565]	34	use dbutil;
[12330]	35	use gsprintf 'gsprintf';
	36	use printusage;
[12546]	37	use parse2;
[27306]	38	use util;
	39	use FileUtils;
[12330]	40
	41	# suppress the annoying "subroutine redefined" warning that various
	42	# gets cause under perl 5.6
	43	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
	44
	45	my $arguments = [
	46	{ 'name' => "xslt_file",
	47	'desc' => "{BasPlugout.xslt_file}",
	48	'type' => "string",
	49	'reqd' => "no",
[20320]	50	'deft' => "",
[12330]	51	'hiddengli' => "no"},
[27499]	52	{ 'name' => "subdir_split_length",
	53	'desc' => "{BasPlugout.subdir_split_length}",
	54	'type' => "int",
	55	'reqd' => "no",
	56	'deft' => "8",
	57	'hiddengli' => "no"},
	58	{ 'name' => "subdir_hash_prefix",
	59	'desc' => "{BasPlugout.subdir_hash_prefix}",
	60	'type' => "flag",
	61	'reqd' => "no",
[28021]	62	'deft' => "0",
[27646]	63	'hiddengli' => "no"},
[28642]	64	{ 'name' => "gzip_output",
	65	'desc' => "{BasPlugout.gzip_output}",
	66	'type' => "flag",
	67	'reqd' => "no",
	68	'hiddengli' => "no"},
	69	{ 'name' => "verbosity",
	70	'desc' => "{BasPlugout.verbosity}",
	71	'type' => "int",
	72	'deft' => "0",
	73	'reqd' => "no",
	74	'hiddengli' => "no"},
	75	{ 'name' => "output_info",
	76	'desc' => "{BasPlugout.output_info}",
	77	'type' => "string",
	78	'reqd' => "yes",
	79	'hiddengli' => "yes"},
	80	{ 'name' => "output_handle",
	81	'desc' => "{BasPlugout.output_handle}",
	82	'type' => "string",
	83	'deft' => 'STDERR',
	84	'reqd' => "no",
	85	'hiddengli' => "yes"},
	86	{ 'name' => "debug",
	87	'desc' => "{BasPlugout.debug}",
	88	'type' => "flag",
	89	'reqd' => "no",
	90	'hiddengli' => "yes"},
[27646]	91	{ 'name' => 'no_rss',
[28707]	92	'desc' => "{BasPlugout.no_rss}",
[27646]	93	'type' => 'flag',
	94	'reqd' => 'no',
[28642]	95	'hiddengli' => 'yes'},
[29013]	96	{ 'name' => 'rss_title',
	97	'desc' => "{BasPlugout.rss_title}",
	98	'type' => 'string',
	99	'deft' => 'dc.Title',
	100	'reqd' => 'no',
	101	'hiddengli' => 'yes'},
[28707]	102	{ 'name' => "no_auxiliary_databases",
	103	'desc' => "{BasPlugout.no_auxiliary_databases}",
	104	'type' => "flag",
	105	'reqd' => "no",
	106	'hiddengli' => "yes"}
	107
[12330]	108	];
	109
[17202]	110	my $options = { 'name' => "BasePlugout",
[12330]	111	'desc' => "{BasPlugout.desc}",
	112	'abstract' => "yes",
	113	'inherits' => "no",
	114	'args' => $arguments};
	115
	116	sub new
	117	{
	118	my $class = shift (@_);
	119
	120	my ($plugoutlist,$args,$hashArgOptLists) = @_;
	121	push(@$plugoutlist, $class);
	122
[20320]	123	my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
[12330]	124
[17202]	125	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	126	push(@{$hashArgOptLists->{"OptList"}},$options);
[12330]	127
	128	my $self = {};
	129	$self->{'plugout_type'} = $class;
	130	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
	131	$self->{"info_only"} = 0;
	132
	133	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
	134	# the args, just return the object.
	135	foreach my $strArg (@{$args})
	136	{
	137	if(defined $strArg && $strArg eq "-gsdlinfo")
	138	{
	139	$self->{"info_only"} = 1;
	140	return bless $self, $class;
	141	}
	142	}
	143
	144	delete $self->{"info_only"};
	145
[12546]	146	if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[12330]	147	{
	148	my $classTempClass = bless $self, $class;
[20320]	149	print STDERR "<BadPlugout d=$plugout_name>\n";
	150	&gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
[12330]	151	$classTempClass->print_txt_usage(""); # Use default resource bundle
	152	die "\n";
	153	}
	154
	155
[12603]	156	if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
[12330]	157	{
[20320]	158	my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
	159	if (!defined $full_file_path) {
[28707]	160	print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
[20320]	161	die "\n";
	162	}
	163	$self->{'xslt_file'} = $full_file_path;
[12330]	164	}
	165
[28642]	166	# for group processing
[28021]	167	$self->{'gs_count'} = 0;
[28642]	168	$self->{'group_position'} = 1;
[12330]	169
[28021]	170	$self->{'keep_import_structure'} = 0;
[12330]	171
[28707]	172	$self->{'generate_databases'} = 1;
	173	if ($self->{'no_auxiliary_databases'}) {
	174	$self->{'generate_databases'} = 0;
	175	}
	176	undef $self->{'no_auxiliary_databases'};
[12330]	177	return bless $self, $class;
	178
	179	}
	180
# Hook for subclasses: override this to perform initialisation once the
# plugout has been loaded and its parameters set, but before any document
# is processed.  The base implementation does nothing.
sub begin {
    my $self = shift;
}
# Print the XML description of this plugout.
#
#   $header                      - when true, the XML header for "plugout"
#                                  is printed first
#   $high_level_information_only - passed straight on to print_xml()
sub print_xml_usage
{
    my ($self, $header, $high_level_information_only) = @_;

    # XML output is always in UTF-8
    gsprintf::output_strings_in_UTF8;

    &PrintUsage::print_xml_header("plugout") if ($header);

    $self->print_xml($high_level_information_only);
}
	202
	203
	204	sub print_xml
	205	{
	206	my $self = shift(@_);
[12628]	207	my $high_level_information_only = shift(@_);
	208
[12330]	209	my $optionlistref = $self->{'option_list'};
	210	my @optionlist = @$optionlistref;
	211	my $plugoutoptions = shift(@$optionlistref);
	212	return if (!defined($plugoutoptions));
	213
	214	gsprintf(STDERR, "<PlugoutInfo>\n");
	215	gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
	216	my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
	217	$desc =~ s/</&lt;/g; # doubly escaped
	218	$desc =~ s/>/&gt;/g;
	219	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
	220	gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
	221	gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
[12628]	222	unless (defined($high_level_information_only)) {
	223	gsprintf(STDERR, " <Arguments>\n");
	224	if (defined($plugoutoptions->{'args'})) {
	225	&PrintUsage::print_options_xml($plugoutoptions->{'args'});
	226	}
	227	gsprintf(STDERR, " </Arguments>\n");
[12330]	228
[12628]	229	# Recurse up the plugout hierarchy
	230	$self->print_xml();
[12330]	231	}
	232	gsprintf(STDERR, "</PlugoutInfo>\n");
	233	}
	234
	235
	236	sub print_txt_usage
	237	{
	238	my $self = shift(@_);
	239
	240	# Print the usage message for a plugout (recursively)
	241	my $descoffset = $self->determine_description_offset(0);
	242	$self->print_plugout_usage($descoffset, 1);
	243	}
	244
	245	sub determine_description_offset
	246	{
	247	my $self = shift(@_);
	248	my $maxoffset = shift(@_);
	249
	250	my $optionlistref = $self->{'option_list'};
	251	my @optionlist = @$optionlistref;
	252	my $plugoutoptions = pop(@$optionlistref);
	253	return $maxoffset if (!defined($plugoutoptions));
	254
	255	# Find the length of the longest option string of this download
	256	my $plugoutargs = $plugoutoptions->{'args'};
	257	if (defined($plugoutargs)) {
	258	my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
	259	if ($longest > $maxoffset) {
	260	$maxoffset = $longest;
	261	}
	262	}
	263
	264	# Recurse up the download hierarchy
	265	$maxoffset = $self->determine_description_offset($maxoffset);
	266	$self->{'option_list'} = \@optionlist;
	267	return $maxoffset;
	268	}
	269
	270
	271	sub print_plugout_usage
	272	{
	273	my $self = shift(@_);
	274	my $descoffset = shift(@_);
	275	my $isleafclass = shift(@_);
	276
	277	my $optionlistref = $self->{'option_list'};
	278	my @optionlist = @$optionlistref;
	279	my $plugoutoptions = shift(@$optionlistref);
	280	return if (!defined($plugoutoptions));
	281
	282	my $plugoutname = $plugoutoptions->{'name'};
	283	my $plugoutargs = $plugoutoptions->{'args'};
	284	my $plugoutdesc = $plugoutoptions->{'desc'};
	285
	286	# Produce the usage information using the data structure above
	287	if ($isleafclass) {
	288	if (defined($plugoutdesc)) {
	289	gsprintf(STDERR, "$plugoutdesc\n\n");
	290	}
	291	gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
	292	}
	293
	294	# Display the download options, if there are some
	295	if (defined($plugoutargs)) {
	296	# Calculate the column offset of the option descriptions
	297	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
	298
	299	if ($isleafclass) {
	300	gsprintf(STDERR, " {common.specific_options}:\n");
	301	}
	302	else {
	303	gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
	304	}
	305
	306	# Display the download options
	307	&PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
	308	}
	309
	310	# Recurse up the download hierarchy
	311	$self->print_plugout_usage($descoffset, 0);
	312	$self->{'option_list'} = \@optionlist;
	313	}
	314
	315
	316	sub error
	317	{
	318	my ($strFunctionName,$strError) = @_;
	319	{
[17202]	320	print "Error occoured in BasePlugout.pm\n".
[12330]	321	"In Function: ".$strFunctionName."\n".
	322	"Error Message: ".$strError."\n";
	323	exit(-1);
	324	}
	325	}
	326
# Record how document identifiers are to be generated.
#
#   $type     - one of "hash", "hash_on_full_filename", "incremental",
#               "filename", "dirname", "full_filename" or "assigned";
#               anything else falls back to "hash"
#   $metadata - only used with "assigned": the metadata field holding the
#               identifier (defaults to "dc.Identifier")
sub set_OIDtype {
    my $self = shift (@_);
    my ($type, $metadata) = @_;

    # The alternatives must be separated by unescaped '|'; with the pipes
    # escaped the pattern only matches a literal pipe-joined string and
    # every type silently falls back to "hash".
    if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
        $self->{'OIDtype'} = $type;
    } else {
        $self->{'OIDtype'} = "hash";
    }
    if ($type =~ /^assigned$/) {
        if (defined $metadata) {
            $self->{'OIDmetadata'} = $metadata;
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier";
        }
    }
}
	345
	346	sub set_output_dir
	347	{
	348	my $self = shift @_;
	349	my ($output_dir) = @_;
	350
	351	$self->{'output_dir'} = $output_dir;
	352	}
	353
	354	sub setoutputdir
	355	{
	356	my $self = shift @_;
	357	my ($output_dir) = @_;
	358
	359	$self->{'output_dir'} = $output_dir;
	360	}
	361
	362	sub get_output_dir
	363	{
	364	my $self = shift (@_);
	365
	366	return $self->{'output_dir'};
	367	}
	368
	369	sub getoutputdir
	370	{
	371	my $self = shift (@_);
	372
	373	return $self->{'output_dir'};
	374	}
	375
	376	sub getoutputinfo
	377	{
	378	my $self = shift (@_);
	379
	380	return $self->{'output_info'};
	381	}
	382
	383
	384	sub get_output_handler
	385	{
	386	my $self = shift (@_);
	387
	388	my ($output_file_name) = @_;
	389
[27500]	390	my $fh;
	391	&FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
	392
	393	return $fh;
[12330]	394	}
	395
	396	sub release_output_handler
	397	{
	398	my $self = shift (@_);
	399	my ($outhandler) = @_;
	400
	401	close($outhandler);
	402
	403	}
	404
	405	sub output_xml_header {
	406	my $self = shift (@_);
	407	my ($handle,$docroot,$nondoctype) = @_;
	408
[23824]	409
	410	#print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
	411
	412	#For Dspace must be UTF in lower case
	413	print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
[12330]	414
	415	if (!defined $nondoctype){
[22818]	416	my $doctype = (defined $docroot) ? $docroot : "Section";
	417
	418	# Used to be '<!DOCTYPE Archive SYSTEM ...'
	419
[28642]	420	print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
[12330]	421	}
	422
	423	print $handle "<$docroot>\n" if defined $docroot;
	424	}
	425
	426	sub output_xml_footer {
	427	my $self = shift (@_);
	428	my ($handle,$docroot) = @_;
	429	print $handle "</$docroot>\n" if defined $docroot;
	430	}
	431
[23824]	432
	433	sub output_general_xml_header
	434	{
	435	my $self = shift (@_);
	436	my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
	437
	438	print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
	439
	440	if (defined $opt_dtd) {
	441	my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
	442	print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
	443	}
	444
	445	if (defined $docroot) {
	446	my $full_docroot = $docroot;
	447	if (defined $opt_attributes) {
	448	$full_docroot .= " $opt_attributes";
	449	}
	450
	451	print $handle "<$full_docroot>\n"
	452	}
	453	}
	454
	455	sub output_general_xml_footer
	456	{
	457	output_xml_footer(@_);
	458	}
	459
# This is called by the plugins after read_into_doc_obj generates the
# doc_obj.  It stamps the document with its modification times, works out
# the directory to save to, lets the subclass save the document and, unless
# auxiliary databases are disabled, records it in the archive info
# database.  Nothing happens when no 'output_info' has been set.
sub process {
    my ($self, $doc_obj) = @_;

    return if (!defined $self->{'output_info'});

    # for OAI purposes
    $doc_obj->set_lastmodified();
    $doc_obj->set_oailastmodified();

    # find out which directory to save to
    my $doc_dir = ($self->is_group())
        ? $self->get_group_doc_dir($doc_obj)
        : $self->get_doc_dir($doc_obj);

    ##############################
    # call subclass' saveas method
    ##############################
    $self->saveas($doc_obj, $doc_dir);

    # write out data to archiveinf-doc.db
    $self->archiveinf_db($doc_obj) if ($self->{'generate_databases'});

    # keep track of how far we are into the current group
    if ($self->is_group()) {
        $self->{'gs_count'}++; # do we want this for all cases?
        $self->{'group_position'}++;
    }
}
	494
# Register the document with the 'output_info' object, together with the
# value it is to be sorted by.
#
# The sort value depends on the 'sortmeta' setting:
#   - unset or blank: an empty sort value; this is the only case in which
#     the position within the group is passed on
#   - "OID":          the document identifier itself
#   - otherwise:      a comma-separated list of metadata names whose
#                     top-section values are cleaned with 'removeprefix' /
#                     'removesuffix' (used as regular expressions),
#                     formatted by sorttools and concatenated
sub store_output_info_reference {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $output_info = $self->{'output_info'};
    my $metaname = $self->{'sortmeta'};

    my $group_position;
    if ($self->is_group()) {
        $group_position = $self->{'group_position'};
    }

    # no sort metadata requested (needs a real logical-or, not escaped pipes)
    if (!defined $metaname || $metaname !~ /\S/) {
        my $OID = $doc_obj->get_OID();
        $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
        return;
    }

    if ($metaname eq "OID") { # sort by OID
        my $OID = $doc_obj->get_OID();
        $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
        return;
    }

    my $metadata = "";
    my $top_section = $doc_obj->get_top_section();

    my @commameta_list = split(/,/, $metaname);
    foreach my $cmn (@commameta_list) {
        my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
        if ($meta) {
            # do remove prefix/suffix - this will apply to all values
            $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
            $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
            $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
            $metadata .= $meta if ($meta);
        }
    }

    # store reference in the output_info
    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);

}
	537
	538
[28642]	539
	540	sub saveas {
	541	my $self = shift (@_);
	542	my ($doc_obj, $doc_dir) = @_;
	543
	544	die "Basplug::saveas function must be implemented in sub classes\n";
	545	}
	546
	547	sub get_group_doc_dir {
[12330]	548	my $self = shift (@_);
	549	my ($doc_obj) = @_;
[28642]	550
	551	my $outhandle = $self->{'output_handle'};
[12330]	552	my $OID = $doc_obj->get_OID();
	553	$OID = "NULL" unless defined $OID;
	554
	555	my $groupsize = $self->{'group_size'};
	556	my $gs_count = $self->{'gs_count'};
	557	my $open_new_file = (($gs_count % $groupsize)==0);
	558
[28642]	559	my $doc_dir;
[12330]	560
[28642]	561	if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
	562	# if we have some assoc files, then we will need to start a new file
	563	if ($self->{'verbosity'} > 2) {
	564	print $outhandle " Starting a archives folder for $OID as it has associated files\n";
[12330]	565	}
[28642]	566	$open_new_file = 1;
	567	}
	568
	569	# opening a new file
	570	if (($open_new_file) \|\| !defined($self->{'gs_doc_dir'})) {
	571	# first we close off the old output
	572	if ($gs_count>0)
	573	{
	574	return if (!$self->close_group_output());
	575	}
[12330]	576
[28642]	577	# this will create the directory
	578	$doc_dir = $self->get_doc_dir ($doc_obj);
	579	$self->{'new_doc_dir'} = 1;
	580	$self->{'gs_doc_dir'} = $doc_dir;
	581	$self->{'group_position'} = 1;
[12330]	582	}
[28642]	583	else {
	584	$doc_dir = $self->{'gs_doc_dir'};
	585	$self->{'new_doc_dir'} = 0;
	586	}
	587	return $doc_dir;
[12330]	588
	589	}
	590	sub get_doc_dir {
[28642]	591
[12330]	592	my $self = shift (@_);
[28642]	593	my ($doc_obj) = @_;
[12330]	594
[28642]	595	my $OID = $doc_obj->get_OID();
	596	$OID = "NULL" unless defined $OID;
	597
[12330]	598	my $working_dir = $self->get_output_dir();
[19775]	599	my $working_info = $self->{'output_info'};
[12330]	600	return if (!defined $working_info);
	601
	602	my $doc_info = $working_info->get_info($OID);
	603	my $doc_dir = '';
	604
[16252]	605	if (defined $doc_info && scalar(@$doc_info) >= 1)
	606	{
	607	# This OID already has an archives directory, so use it again
[12330]	608	$doc_dir = $doc_info->[0];
	609	$doc_dir =~ s/\/?((doc(mets)?)\|(dublin_core))\.xml(\.gz)?$//;
[16252]	610	}
	611	elsif ($self->{'keep_import_structure'})
	612	{
[28642]	613	my $source_filename = $doc_obj->get_source_filename();
[12330]	614	$source_filename = &File::Basename::dirname($source_filename);
	615	$source_filename =~ s/[\\\/]+/\//g;
	616	$source_filename =~ s/\/$//;
	617
	618	$doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
[16252]	619	}
[12330]	620
[16252]	621	# We have to use a new archives directory for this document
	622	if ($doc_dir eq "")
	623	{
	624	$doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
[12330]	625	}
	626
[28642]	627	&FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
[16252]	628
[12330]	629	return $doc_dir;
	630	}
	631
[27880]	632
## @function get_new_doc_dir()
#
# Once a doc object is ready to write to disk (and hence has a nice OID),
# generate a unique subdirectory to write the information to.
#
# - create the directory as part of this call, to try and avoid race
#   conditions found in parallel processing [jmt12]
#
# @param $working_info object whose size() takes part in deciding how deep
#                      to nest
# @param $working_dir  directory below which the new directory is created
# @param $OID          identifier of the document
# @return the new directory (relative to $working_dir), ending in ".dir"
#
# @todo figure out what the rule regarding $work_info->size() is meant to do
#
# @todo determine what $self->{'group'} is, and whether it should affect
#       directory creation
#
sub get_new_doc_dir
{
    my $self = shift (@_);
    my($working_info,$working_dir,$OID) = @_;

    my $doc_dir = "";
    my $doc_dir_rest = $OID;

    # remove any \ and / from the OID
    $doc_dir_rest =~ s/[\\\/]//g;

    # Remove ":" if we are on Windows OS, as otherwise they get confused with
    # the drive letters (an unset GSDLOS is treated as "not Windows")
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $doc_dir_rest =~ s/\://g;
    }

    # we generally create a unique directory by adding consecutive fragments
    # of the document identifier (split by some predefined length -
    # defaulting to 8) until we find a directory that doesn't yet exist. Note
    # that directories that contain a document have a suffix ".dir" (whereas
    # those that contain only subdirectories have no suffix).
    my $doc_dir_num = 0; # how many directories deep we are
    my $created_directory = 0; # have we successfully created a new directory
    do
    {
        # (does this work on windows? - jmt12)
        if ($doc_dir_num > 0)
        {
            $doc_dir .= '/';
        }

        # the default matching pattern grabs the next 'subdir_split_length'
        # characters of the OID to act as the next subdirectory
        my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';

        # Do we count any "HASH" prefix against the split length limit?
        if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
        {
            $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
        }

        # Note the use of 's' to both capture the next chunk of OID and to
        # remove it from OID at the same time
        if ($doc_dir_rest =~ s/$pattern//i)
        {
            $doc_dir .= $1;
            $doc_dir_num++;

            my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
            if(!FileUtils::directoryExists($full_doc_dir))
            {
                &FileUtils::makeAllDirectories($full_doc_dir);
                $created_directory = 1;
            }
        }
    }
    # keep going while there is identifier left and either nothing has been
    # created yet, or the collection is large and we are less than two
    # levels deep (needs a real logical-or, not escaped pipes)
    while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));

    # not unique yet? Add on an incremental suffix until we are unique
    my $i = 1;
    my $doc_dir_base = $doc_dir;
    while ($created_directory == 0)
    {
        $doc_dir = $doc_dir_base . '-' . $i;
        $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
        $i++;
    }

    # in theory this should never happen
    # NOTE(review): the loop above only ends once $created_directory is
    # true, so this check looks unreachable - confirm before relying on it
    if (!$created_directory)
    {
        die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
    }

    return $doc_dir . '.dir';
}
## get_new_doc_dir()
[12330]	730
[27513]	731
# Link the files associated with a document into its archives directory and
# record them as metadata on the document's top section.
#
#   $doc_obj - the document
#   $doc_dir - the document's directory, relative to the output directory
#   $handle  - accepted but not used here
#
# "assocfilepath" is always set to $doc_dir.  For every associated file
# that exists, a hard link is made in the document's directory and a
# "gsdlassocfile" value is added; missing files are reported on the output
# handle when verbosity is above 1.  Does nothing when no output directory
# has been set.
sub process_assoc_files {
    my ($self, $doc_obj, $doc_dir, $handle) = @_;

    my $outhandle = $self->{'output_handle'};

    my $output_dir = $self->get_output_dir();
    return if (!defined $output_dir);

    # make sure both the output directory and the document's own directory
    # are there
    if (!&FileUtils::directoryExists($output_dir)) {
        &FileUtils::makeAllDirectories($output_dir);
    }

    my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
    if (!&FileUtils::directoryExists($working_dir)) {
        &FileUtils::makeAllDirectories($working_dir);
    }

    my $filename;

    my $source_filename = $doc_obj->get_source_filename();

    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};

    if (defined $collect_dir) {
        my $dirsep_regexp = &util::get_os_dirsep();

        # ensure there is a slash at the end
        unless ($collect_dir =~ /$dirsep_regexp$/) {
            $collect_dir .= &util::get_dirsep();
        }

        # This test is never going to fail on Windows -- is this a problem?
        unless ($source_filename =~ /^$dirsep_regexp/) {
            $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
        }
    }

    # set the assocfile path (even if we have no assoc files - need this for lucene)
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(),
                                        "assocfilepath",
                                        "$doc_dir");

    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
        # split the recorded name into its directory part and its file name
        my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
        $dir = "" unless defined $dir;

        # for some reason the image associate file can carry a leading
        # backslash before the full path
        my $utf8_real_filename = $assoc_file_rec->[0];
        $utf8_real_filename =~ s/^\\(.*)/$1/i;

        my $real_filename = &util::downgrade_if_dos_filename($utf8_real_filename);

        if (&FileUtils::fileExists($real_filename)) {
            $filename = &FileUtils::filenameConcatenate($working_dir, $afile);

            &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});

            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),
                                        "gsdlassocfile",
                                        "$afile:$assoc_file_rec->[2]:$dir");
        }
        elsif ($self->{'verbosity'} > 1) {
            print $outhandle "BasePlugout::process couldn't copy the associated file " .
                "$real_filename to $afile\n";
        }
    }
}
	800
[17087]	801
# Turn the "gsdlmetafile" values on the document's top section into
# metadata-file records on the document, then remove that metadata.  Each
# value has the form "<full metafile> : <metafile>".
sub process_metafiles_metadata
{
    my ($self, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $metafile_pairs = $doc_obj->get_metadata($top_section, "gsdlmetafile");

    for my $pair (@$metafile_pairs) {
        my ($full_metafile, $metafile) = split(/ : /, $pair);
        $doc_obj->metadata_file($full_metafile, $metafile);
    }

    $doc_obj->delete_metadata($top_section, "gsdlmetafile");
}
	818
	819	sub archiveinf_files_to_field
	820	{
	821	my $self = shift(@_);
	822	my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
	823
	824	foreach my $file_rec (@$files) {
[20777]	825	my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
	826	my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
[19494]	827	# for some reasons the image associate file has / before the full path
	828	$real_filename =~ s/^\\(.*)/$1/i;
	829
[23387]	830	my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
[23363]	831
[27505]	832	if (&FileUtils::fileExists($raw_filename)) {
[23363]	833
[20763]	834	# if (defined $collect_dir) {
	835	# my $collect_dir_re_safe = $collect_dir;
[24829]	836	# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
[20763]	837	# $collect_dir_re_safe =~ s/\./\\./g;##
[19494]	838
[20763]	839	# $real_filename =~ s/^$collect_dir_re_safe//;
	840	# }
[22328]	841
[20801]	842	if (defined $reverse_lookups) {
	843	$reverse_lookups->{$real_filename} = 1;
	844	}
[28211]	845
	846	if($field =~ m@assoc-file\|src-file\|meta-file@) {
	847	$raw_filename = &util::abspath_to_placeholders($raw_filename);
	848	}
	849
	850	### push(@{$oid_files->{$field}},$full_file);
[23363]	851	push(@{$oid_files->{$field}},$raw_filename);
[19494]	852	}
	853	else {
[19516]	854	print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
[19494]	855	}
	856	}
	857	}
	858
[21564]	859	sub archiveinf_db
[17087]	860	{
	861	my $self = shift (@_);
	862	my ($doc_obj) = @_;
	863
	864	my $verbosity = $self->{'verbosity'};
	865
	866	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
	867	if (defined $collect_dir) {
	868	my $dirsep_regexp = &util::get_os_dirsep();
	869
	870	if ($collect_dir !~ /$dirsep_regexp$/) {
	871	# ensure there is a slash at the end
	872	$collect_dir .= &util::get_dirsep();
	873	}
	874	}
	875
	876	my $oid = $doc_obj->get_OID();
[19829]	877	my $source_filename = $doc_obj->get_unmodified_source_filename();
[18441]	878	my $working_info = $self->{'output_info'};
	879	my $doc_info = $working_info->get_info($oid);
[20651]	880
[28642]	881	my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
[22328]	882	# doc_file is the path to the archive doc.xml. Make sure it has unix
	883	# slashes, then if the collection is copied to linux, it can be built without reimport
	884	$doc_file =~ s/\\/\//g;
[18441]	885	my $oid_files = { 'doc-file' => $doc_file,
	886	'index-status' => $index_status,
	887	'src-file' => $source_filename,
[20747]	888	'sort-meta' => $sortmeta,
[19775]	889	'assoc-file' => [],
	890	'meta-file' => [] };
[28642]	891	if (defined $group_position) {
	892	$oid_files->{'group-position'} = $group_position;
	893	}
[19494]	894	my $reverse_lookups = { $source_filename => "1" };
[17087]	895
	896
[20777]	897	$self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
[19494]	898	$collect_dir,$oid_files,$reverse_lookups);
[17087]	899
[17120]	900
[19775]	901	$self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
[20801]	902	$collect_dir,$oid_files);
[17087]	903
[21584]	904	# Get the infodbtype value for this collection from the arcinfo object
	905	my $infodbtype = $self->{'output_info'}->{'infodbtype'};
[17087]	906	my $output_dir = $self->{'output_dir'};
	907
[21584]	908	my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
[17087]	909
[18528]	910	##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
[17087]	911
[27646]	912	if (!$self->{'no_rss'})
	913	{
	914	if (($oid_files->{'index-status'} eq "I") \|\| ($oid_files->{'index-status'} eq "R")) {
[24958]	915	my $top_section = $doc_obj->get_top_section();
[29013]	916
	917	# rss_title can be set in collect.cfg as follows:
	918	# plugout GreenstoneXMLPlugout -rss_title "dc.Title; ex.Title"
	919	# rss_title is a semi-colon or comma-separated list of the metadata field names that should
	920	# be consulted in order to obtain a Title (anchor text) for the RSS document link.
	921	# If not specified, rss_title will default to dc.Title, and fall back on Untitled
	922	my $metafieldnames = $self->{'rss_title'};
	923	my @metafieldarray = split(/[,;] ?/,$metafieldnames); # , or ; separator can be followed by an optional space
	924	my $titles;
	925	#@$titles=(); # at worst @$titles will be (), as get_metadata(dc.Titles) may return ()
	926	foreach my $metafieldname (@metafieldarray) {
	927	$metafieldname =~ s@^ex\.@@; # if ex.Title, need to get_metadata() on metafieldname=Title
	928	$titles = $doc_obj->get_metadata($top_section,$metafieldname);
[24958]	929
[29013]	930	if(scalar(@$titles) != 0) { # found at least one title for one metafieldname
	931	last; # break out of the loop
	932	}
[28999]	933	}
[29013]	934
	935	# if ex.Title was listed in the metafieldnames, then we'll surely have a value for title for this doc
	936	# otherwise, if we have no titles at this point, add in a default of Untitled as this doc's title
	937	if(scalar(@$titles) == 0) { #&& $metafieldnames !~ [email protected]@) {
	938	push(@$titles, "Untitled");
	939	}
	940
	941	# encode basic html entities like <>"& in the title(s), since the & char can break RSS links
	942	for (my $i = 0; $i < scalar(@$titles); $i++) {
	943	&ghtml::htmlsafe(@$titles[$i]);
	944	}
[28999]	945
	946	my $dc_title = join("; ", @$titles);
	947
[24958]	948	if ($oid_files->{'index-status'} eq "R") {
	949	$dc_title .= " (Updated)";
	950	}
	951
[28021]	952	my $rss_entry = "<item>\n";
	953	$rss_entry .= " <title>$dc_title</title>\n";
[28997]	954	if(&util::is_gs3()) {
[28996]	955	$rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
	956	} else {
	957	$rss_entry .= " <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
	958	}
[28021]	959	$rss_entry .= "</item>";
	960
	961	if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
	962	{
	963	my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
	964	my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
	965	&dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
	966	&dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
	967	}
	968	else
	969	{
	970	my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
	971	my $rss_fh;
[28804]	972	if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
[28021]	973	{
	974	print $rss_fh $rss_entry . "\n";
[27511]	975	&FileUtils::closeFileHandle($rss_filename, \$rss_fh);
[28021]	976	}
	977	else
	978	{
[27646]	979	print STDERR "**** Failed to open $rss_filename\n$!\n";
[28021]	980	}
	981	}
[27646]	982	}
[24958]	983	}
	984
[19775]	985	$oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
	986	$oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
[28211]	987	$oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
[19775]	988	$oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
[20747]	989	$oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
[28642]	990	if (defined $oid_files->{'group-position'}) {
	991	$oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
	992	}
[19775]	993
[21584]	994	my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
	995	&dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
	996	&dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
[19775]	997
[19494]	998	foreach my $rl (keys %$reverse_lookups) {
[19775]	999	$working_info->add_reverseinfo($rl,$oid);
[20801]	1000	}
	1001
	1002	# meta files are not set in the reverse entry, but we still need to set the metadata flag
[20814]	1003	if (defined $doc_obj->get_meta_files()) {
	1004	foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
	1005	my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
	1006	$working_info->set_meta_file_flag($full_file);
	1007	}
[20801]	1008	}
[17087]	1009	}
	1010
	1011
# Records the sort metadata settings on this plugout object.
#   $sortmeta     - stored unchanged in $self->{'sortmeta'}
#   $removeprefix - when defined and true, stored in $self->{'removeprefix'}
#                   with one leading '^' dropped
#   $removesuffix - when defined and true, stored in $self->{'removesuffix'}
#                   with one trailing '$' dropped
# When a prefix/suffix argument is not supplied, the corresponding field
# of $self is left untouched.
sub set_sortmeta {
    my ($self, $sortmeta, $removeprefix, $removesuffix) = @_;

    $self->{'sortmeta'} = $sortmeta;

    if (defined $removeprefix && $removeprefix) {
        # don't need a leading ^
        $removeprefix =~ s/^\^//;
        $self->{'removeprefix'} = $removeprefix;
    }

    if (defined $removesuffix && $removesuffix) {
        # don't need a trailing $
        $removesuffix =~ s/\$$//;
        $self->{'removesuffix'} = $removesuffix;
    }
}
	1026
[28707]	1027
	1028
# Opens a pipe to the ApplyXSLT java program and remembers the write handle
# in $self->{'xslt_writer'}.
#
# Arguments:
#   $output_file_name - written to the pipe on the line after the
#                       "<?DocStart?>" marker
#   $xslt_file        - path of the stylesheet, passed to ApplyXSLT with -t
#
# If $self->{'mapping_file'} is set (non-empty), it is passed on with -m.
#
# Returns without doing anything when $xslt_file is undefined, empty, or
# does not exist. Dies if the pipe cannot be opened.
#
# The pipe is opened on a lexical filehandle rather than a package-global
# bareword handle, so that opening a second pipe (e.g. from another plugout
# instance) cannot implicitly close and replace one that is still in use.
sub open_xslt_pipe
{
    my $self = shift @_;
    my ($output_file_name, $xslt_file) = @_;

    return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);

    my $apply_xslt_jar = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
    my $xalan_jar = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");

    # The class path separator is ';' on windows and ':' everywhere else.
    # (Historical note: prefixing $xslt_file with "file:///" on windows
    # was tried and did not work on windows XP, so the path is passed as is.)
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} eq "windows");
    my $path_separator = $on_windows ? ";" : ":";

    my $java_class_path = "\"" . $apply_xslt_jar . $path_separator . $xalan_jar . "\"";

    # NOTE(review): the command is run through the shell; file names
    # containing a double quote would break the quoting below.
    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne "") {
        my $mapping_file_path = "\"" . $self->{'mapping_file'} . "\"";
        $cmd .= "-m $mapping_file_path";
    }

    # Two-argument piped open is kept deliberately: it is the form this code
    # has always used on every supported platform. $cmd always starts with
    # a literal "| ", so the open mode cannot be altered by the file names.
    my $xslt_writer;
    open($xslt_writer, $cmd)
        or die "can't open pipe to xslt: $!";

    $self->{'xslt_writer'} = $xslt_writer;

    print $xslt_writer "<?DocStart?>\n";
    print $xslt_writer "$output_file_name\n";
}
	1071
	1072
	1073	sub close_xslt_pipe
	1074	{
	1075	my $self = shift @_;
	1076
	1077
	1078	return unless defined $self->{'xslt_writer'} ;
	1079
	1080	my $xsltwriter = $self->{'xslt_writer'};
	1081
	1082	print $xsltwriter "<?DocEnd?>\n";
	1083	close($xsltwriter);
[13024]	1084
	1085	undef $self->{'xslt_writer'};
	1086
[12330]	1087	}
	1088
	1089
	1090
	1091	#the subclass should implement this method if is_group method could return 1.
	1092	sub close_group_output{
	1093	my $self = shift (@_);
	1094	}
	1095
	1096	sub is_group {
	1097	my $self = shift (@_);
	1098	return 0;
	1099	}
	1100
# Lookup table (element name => 1) of the 15 element names that make up
# the Dublin Core set used by get_dc_metadata. Keys are capitalised
# exactly as listed here.
my $dc_set = { map { ($_ => 1) } qw(
    Title Creator Subject Description Publisher
    Contributor Date Type Format Identifier
    Source Language Relation Coverage Rights
) };
	1116
	1117
	1118	# returns an XML representation of the dublin core metadata
[24404]	1119	# if dc meta is not found, try ex meta
	1120	# This method is not used by the DSpacePlugout, which has its
	1121	# own method to save its dc metadata
[13172]	1122	sub get_dc_metadata {
	1123	my $self = shift(@_);
	1124	my ($doc_obj, $section, $version) = @_;
	1125
	1126	# build up string of dublin core metadata
	1127	$section="" unless defined $section;
	1128
	1129	my $section_ptr = $doc_obj->_lookup_section($section);
	1130	return "" unless defined $section_ptr;
	1131
	1132
	1133	my $explicit_dc = {};
[24404]	1134	my $explicit_ex_dc = {};
[13172]	1135	my $explicit_ex = {};
	1136
	1137	my $all_text="";
[24404]	1138
	1139	# We want high quality dc metadata to go in first, so we store all the
	1140	# assigned dc.* values first. Then, for all those dc metadata names in
	1141	# the official dc set that are as yet unassigned, we look to see whether
	1142	# embedded ex.dc.* metadata has defined some values for them. If not,
	1143	# then for the same missing dc metadata names, we look in ex metadata.
	1144
[13172]	1145	foreach my $data (@{$section_ptr->{'metadata'}}){
	1146	my $escaped_value = &docprint::escape_text($data->[1]);
	1147	if ($data->[0]=~ m/^dc\./) {
	1148	$data->[0] =~ tr/[A-Z]/[a-z]/;
	1149
	1150	$data->[0] =~ m/^dc\.(.*)/;
	1151	my $dc_element = $1;
	1152
	1153	if (!defined $explicit_dc->{$dc_element}) {
	1154	$explicit_dc->{$dc_element} = [];
	1155	}
	1156	push(@{$explicit_dc->{$dc_element}},$escaped_value);
	1157
	1158	if (defined $version && ($version eq "oai_dc")) {
	1159	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
	1160	}
	1161	else {
	1162	# qualifier???
	1163	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
	1164	}
	1165
[24404]	1166	} elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
	1167	$data->[0] =~ m/^ex\.dc\.(.*)/;
	1168	my $ex_dc_element = $1;
	1169	my $lc_ex_dc_element = lc($ex_dc_element);
	1170
	1171	# only store the ex.dc value for this dc metaname if no dc.* was assigned for it
	1172	if (defined $dc_set->{$ex_dc_element}) {
	1173	if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
	1174	$explicit_ex_dc->{$lc_ex_dc_element} = [];
	1175	}
	1176	push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
	1177	}
	1178	}
	1179	elsif (($data->[0] =~ m/^ex\./) \|\| ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
[13172]	1180	$data->[0] =~ m/^(ex\.)?(.*)/;
[24404]	1181	my $ex_element = $2;
[13172]	1182	my $lc_ex_element = lc($ex_element);
	1183
	1184	if (defined $dc_set->{$ex_element}) {
	1185	if (!defined $explicit_ex->{$lc_ex_element}) {
	1186	$explicit_ex->{$lc_ex_element} = [];
	1187	}
	1188	push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
	1189	}
	1190	}
	1191	}
	1192
	1193	# go through dc_set and for any element not defined in explicit_dc
[14932]	1194	# that does exist in explicit_ex, add it in as metadata
[13172]	1195	foreach my $k ( keys %$dc_set ) {
	1196	my $lc_k = lc($k);
	1197
	1198	if (!defined $explicit_dc->{$lc_k}) {
[24404]	1199	# try to find if ex.dc.* defines this dc.* meta,
	1200	# if not, then look for whether there's an ex.* equivalent
[13172]	1201
[24404]	1202	if (defined $explicit_ex_dc->{$lc_k}) {
	1203	foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
	1204	my $dc_element = $lc_k;
	1205	my $escaped_value = $v;
	1206
	1207	if (defined $version && ($version eq "oai_dc")) {
	1208	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
	1209	}
	1210	else {
	1211	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
	1212	}
	1213	}
	1214	} elsif (defined $explicit_ex->{$lc_k}) {
[13172]	1215	foreach my $v (@{$explicit_ex->{$lc_k}}) {
	1216	my $dc_element = $lc_k;
	1217	my $escaped_value = $v;
	1218
	1219	if (defined $version && ($version eq "oai_dc")) {
	1220	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
	1221	}
	1222	else {
	1223	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
	1224	}
	1225	}
	1226	}
	1227	}
	1228	}
	1229
	1230	if ($all_text eq "") {
	1231	$all_text .= " There is no Dublin Core metatdata in this document\n";
	1232	}
	1233	$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
	1234
	1235	return $all_text;
	1236	}
	1237
	1238	# Build up dublin_core metadata. Priority given to dc.* over ex.*
	1239	# This method was apparently added by Jeffrey and committed by Shaoqun.
	1240	# But we don't know why it was added, so not using it anymore.
	1241	sub new_get_dc_metadata {
	1242
	1243	my $self = shift(@_);
	1244	my ($doc_obj, $section, $version) = @_;
	1245
	1246	# build up string of dublin core metadata
	1247	$section="" unless defined $section;
	1248
	1249	my $section_ptr=$doc_obj->_lookup_section($section);
	1250	return "" unless defined $section_ptr;
	1251
	1252	my $all_text = "";
	1253	foreach my $data (@{$section_ptr->{'metadata'}}){
	1254	my $escaped_value = &docprint::escape_text($data->[1]);
	1255	my $dc_element = $data->[0];
	1256
	1257	my @array = split('\.',$dc_element);
	1258	my ($type,$name);
	1259
	1260	if(defined $array[1])
	1261	{
	1262	$type = $array[0];
	1263	$name = $array[1];
	1264	}
	1265	else
	1266	{
	1267	$type = "ex";
	1268	$name = $array[0];
	1269	}
	1270
	1271	$all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
	1272	}
	1273	return $all_text;
	1274	}
	1275
	1276
[12330]	1277	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: