Context Navigation

source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 31494

Last change on this file since 31494 was 29163, checked in by kjdon, 10 years ago
need to include sorttools to get the format metadata for sorting routine
Property svn:keywords set to `Author Date Id Revision`
File size: 37.1 KB

Line
1	###########################################################################
2	#
3	# BasePlugout.pm -- base class for all the plugout modules
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 2006 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasePlugout;
27
28	eval {require bytes};
29
30	use strict;
31	no strict 'subs';
32	no strict 'refs';
33
34	use dbutil;
35	use gsprintf 'gsprintf';
36	use printusage;
37	use parse2;
38	use util;
39	use FileUtils;
40	use sorttools;
41
42	# suppress the annoying "subroutine redefined" warning that various
43	# gets cause under perl 5.6
44	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
45
46	my $arguments = [
47	{ 'name' => "xslt_file",
48	'desc' => "{BasPlugout.xslt_file}",
49	'type' => "string",
50	'reqd' => "no",
51	'deft' => "",
52	'hiddengli' => "no"},
53	{ 'name' => "subdir_split_length",
54	'desc' => "{BasPlugout.subdir_split_length}",
55	'type' => "int",
56	'reqd' => "no",
57	'deft' => "8",
58	'hiddengli' => "no"},
59	{ 'name' => "subdir_hash_prefix",
60	'desc' => "{BasPlugout.subdir_hash_prefix}",
61	'type' => "flag",
62	'reqd' => "no",
63	'deft' => "0",
64	'hiddengli' => "no"},
65	{ 'name' => "gzip_output",
66	'desc' => "{BasPlugout.gzip_output}",
67	'type' => "flag",
68	'reqd' => "no",
69	'hiddengli' => "no"},
70	{ 'name' => "verbosity",
71	'desc' => "{BasPlugout.verbosity}",
72	'type' => "int",
73	'deft' => "0",
74	'reqd' => "no",
75	'hiddengli' => "no"},
76	{ 'name' => "output_info",
77	'desc' => "{BasPlugout.output_info}",
78	'type' => "string",
79	'reqd' => "yes",
80	'hiddengli' => "yes"},
81	{ 'name' => "output_handle",
82	'desc' => "{BasPlugout.output_handle}",
83	'type' => "string",
84	'deft' => 'STDERR',
85	'reqd' => "no",
86	'hiddengli' => "yes"},
87	{ 'name' => "debug",
88	'desc' => "{BasPlugout.debug}",
89	'type' => "flag",
90	'reqd' => "no",
91	'hiddengli' => "yes"},
92	{ 'name' => 'no_rss',
93	'desc' => "{BasPlugout.no_rss}",
94	'type' => 'flag',
95	'reqd' => 'no',
96	'hiddengli' => 'yes'},
97	{ 'name' => 'rss_title',
98	'desc' => "{BasPlugout.rss_title}",
99	'type' => 'string',
100	'deft' => 'dc.Title',
101	'reqd' => 'no',
102	'hiddengli' => 'yes'},
103	{ 'name' => "no_auxiliary_databases",
104	'desc' => "{BasPlugout.no_auxiliary_databases}",
105	'type' => "flag",
106	'reqd' => "no",
107	'hiddengli' => "yes"}
108
109	];
110
111	my $options = { 'name' => "BasePlugout",
112	'desc' => "{BasPlugout.desc}",
113	'abstract' => "yes",
114	'inherits' => "no",
115	'args' => $arguments};
116
117	sub new
118	{
119	my $class = shift (@_);
120
121	my ($plugoutlist,$args,$hashArgOptLists) = @_;
122	push(@$plugoutlist, $class);
123
124	my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
125
126	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
127	push(@{$hashArgOptLists->{"OptList"}},$options);
128
129	my $self = {};
130	$self->{'plugout_type'} = $class;
131	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
132	$self->{"info_only"} = 0;
133
134	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
135	# the args, just return the object.
136	foreach my $strArg (@{$args})
137	{
138	if(defined $strArg && $strArg eq "-gsdlinfo")
139	{
140	$self->{"info_only"} = 1;
141	return bless $self, $class;
142	}
143	}
144
145	delete $self->{"info_only"};
146
147	if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
148	{
149	my $classTempClass = bless $self, $class;
150	print STDERR "<BadPlugout d=$plugout_name>\n";
151	&gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
152	$classTempClass->print_txt_usage(""); # Use default resource bundle
153	die "\n";
154	}
155
156
157	if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
158	{
159	my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
160	if (!defined $full_file_path) {
161	print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
162	die "\n";
163	}
164	$self->{'xslt_file'} = $full_file_path;
165	}
166
167	# for group processing
168	$self->{'gs_count'} = 0;
169	$self->{'group_position'} = 1;
170
171	$self->{'keep_import_structure'} = 0;
172
173	$self->{'generate_databases'} = 1;
174	if ($self->{'no_auxiliary_databases'}) {
175	$self->{'generate_databases'} = 0;
176	}
177	undef $self->{'no_auxiliary_databases'};
178	return bless $self, $class;
179
180	}
181
182	# implement this in subclass if you want to do some initialization after
183	# loading and setting parameters, and before processing the documents.
184	sub begin {
185
186	my $self= shift (@_);
187
188	}
189	sub print_xml_usage
190	{
191	my $self = shift(@_);
192	my $header = shift(@_);
193	my $high_level_information_only = shift(@_);
194
195	# XML output is always in UTF-8
196	gsprintf::output_strings_in_UTF8;
197
198	if ($header) {
199	&PrintUsage::print_xml_header("plugout");
200	}
201	$self->print_xml($high_level_information_only);
202	}
203
204
205	sub print_xml
206	{
207	my $self = shift(@_);
208	my $high_level_information_only = shift(@_);
209
210	my $optionlistref = $self->{'option_list'};
211	my @optionlist = @$optionlistref;
212	my $plugoutoptions = shift(@$optionlistref);
213	return if (!defined($plugoutoptions));
214
215	gsprintf(STDERR, "<PlugoutInfo>\n");
216	gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
217	my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
218	$desc =~ s/</&lt;/g; # doubly escaped
219	$desc =~ s/>/&gt;/g;
220	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221	gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
222	gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
223	unless (defined($high_level_information_only)) {
224	gsprintf(STDERR, " <Arguments>\n");
225	if (defined($plugoutoptions->{'args'})) {
226	&PrintUsage::print_options_xml($plugoutoptions->{'args'});
227	}
228	gsprintf(STDERR, " </Arguments>\n");
229
230	# Recurse up the plugout hierarchy
231	$self->print_xml();
232	}
233	gsprintf(STDERR, "</PlugoutInfo>\n");
234	}
235
236
237	sub print_txt_usage
238	{
239	my $self = shift(@_);
240
241	# Print the usage message for a plugout (recursively)
242	my $descoffset = $self->determine_description_offset(0);
243	$self->print_plugout_usage($descoffset, 1);
244	}
245
246	sub determine_description_offset
247	{
248	my $self = shift(@_);
249	my $maxoffset = shift(@_);
250
251	my $optionlistref = $self->{'option_list'};
252	my @optionlist = @$optionlistref;
253	my $plugoutoptions = pop(@$optionlistref);
254	return $maxoffset if (!defined($plugoutoptions));
255
256	# Find the length of the longest option string of this download
257	my $plugoutargs = $plugoutoptions->{'args'};
258	if (defined($plugoutargs)) {
259	my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
260	if ($longest > $maxoffset) {
261	$maxoffset = $longest;
262	}
263	}
264
265	# Recurse up the download hierarchy
266	$maxoffset = $self->determine_description_offset($maxoffset);
267	$self->{'option_list'} = \@optionlist;
268	return $maxoffset;
269	}
270
271
272	sub print_plugout_usage
273	{
274	my $self = shift(@_);
275	my $descoffset = shift(@_);
276	my $isleafclass = shift(@_);
277
278	my $optionlistref = $self->{'option_list'};
279	my @optionlist = @$optionlistref;
280	my $plugoutoptions = shift(@$optionlistref);
281	return if (!defined($plugoutoptions));
282
283	my $plugoutname = $plugoutoptions->{'name'};
284	my $plugoutargs = $plugoutoptions->{'args'};
285	my $plugoutdesc = $plugoutoptions->{'desc'};
286
287	# Produce the usage information using the data structure above
288	if ($isleafclass) {
289	if (defined($plugoutdesc)) {
290	gsprintf(STDERR, "$plugoutdesc\n\n");
291	}
292	gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
293	}
294
295	# Display the download options, if there are some
296	if (defined($plugoutargs)) {
297	# Calculate the column offset of the option descriptions
298	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
299
300	if ($isleafclass) {
301	gsprintf(STDERR, " {common.specific_options}:\n");
302	}
303	else {
304	gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
305	}
306
307	# Display the download options
308	&PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
309	}
310
311	# Recurse up the download hierarchy
312	$self->print_plugout_usage($descoffset, 0);
313	$self->{'option_list'} = \@optionlist;
314	}
315
316
317	sub error
318	{
319	my ($strFunctionName,$strError) = @_;
320	{
321	print "Error occoured in BasePlugout.pm\n".
322	"In Function: ".$strFunctionName."\n".
323	"Error Message: ".$strError."\n";
324	exit(-1);
325	}
326	}
327
328	# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
329	sub set_OIDtype {
330	my $self = shift (@_);
331	my ($type, $metadata) = @_;
332
333	if ($type =~ /^(hash\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
334	$self->{'OIDtype'} = $type;
335	} else {
336	$self->{'OIDtype'} = "hash";
337	}
338	if ($type =~ /^assigned$/) {
339	if (defined $metadata) {
340	$self->{'OIDmetadata'} = $metadata;
341	} else {
342	$self->{'OIDmetadata'} = "dc.Identifier";
343	}
344	}
345	}
346
347	sub set_output_dir
348	{
349	my $self = shift @_;
350	my ($output_dir) = @_;
351
352	$self->{'output_dir'} = $output_dir;
353	}
354
355	sub setoutputdir
356	{
357	my $self = shift @_;
358	my ($output_dir) = @_;
359
360	$self->{'output_dir'} = $output_dir;
361	}
362
363	sub get_output_dir
364	{
365	my $self = shift (@_);
366
367	return $self->{'output_dir'};
368	}
369
370	sub getoutputdir
371	{
372	my $self = shift (@_);
373
374	return $self->{'output_dir'};
375	}
376
377	sub getoutputinfo
378	{
379	my $self = shift (@_);
380
381	return $self->{'output_info'};
382	}
383
384
385	sub get_output_handler
386	{
387	my $self = shift (@_);
388
389	my ($output_file_name) = @_;
390
391	my $fh;
392	&FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
393
394	return $fh;
395	}
396
397	sub release_output_handler
398	{
399	my $self = shift (@_);
400	my ($outhandler) = @_;
401
402	close($outhandler);
403
404	}
405
406	sub output_xml_header {
407	my $self = shift (@_);
408	my ($handle,$docroot,$nondoctype) = @_;
409
410
411	#print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
412
413	#For Dspace must be UTF in lower case
414	print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
415
416	if (!defined $nondoctype){
417	my $doctype = (defined $docroot) ? $docroot : "Section";
418
419	# Used to be '<!DOCTYPE Archive SYSTEM ...'
420
421	print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
422	}
423
424	print $handle "<$docroot>\n" if defined $docroot;
425	}
426
427	sub output_xml_footer {
428	my $self = shift (@_);
429	my ($handle,$docroot) = @_;
430	print $handle "</$docroot>\n" if defined $docroot;
431	}
432
433
434	sub output_general_xml_header
435	{
436	my $self = shift (@_);
437	my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
438
439	print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
440
441	if (defined $opt_dtd) {
442	my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
443	print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
444	}
445
446	if (defined $docroot) {
447	my $full_docroot = $docroot;
448	if (defined $opt_attributes) {
449	$full_docroot .= " $opt_attributes";
450	}
451
452	print $handle "<$full_docroot>\n"
453	}
454	}
455
456	sub output_general_xml_footer
457	{
458	output_xml_footer(@_);
459	}
460
461	# This is called by the plugins after read_into_doc_obj generates the doc_obj.
462	sub process {
463	my $self = shift (@_);
464	my ($doc_obj) = @_;
465
466	my $output_info = $self->{'output_info'};
467	return if (!defined $output_info);
468
469	# for OAI purposes
470	$doc_obj->set_lastmodified();
471	$doc_obj->set_oailastmodified();
472
473	# find out which directory to save to
474	my $doc_dir = "";
475	if ($self->is_group()) {
476	$doc_dir = $self->get_group_doc_dir($doc_obj);
477	} else {
478	$doc_dir = $self->get_doc_dir($doc_obj);
479	}
480
481	##############################
482	# call subclass' saveas method
483	##############################
484	$self->saveas($doc_obj,$doc_dir);
485
486	# write out data to archiveinf-doc.db
487	if ($self->{'generate_databases'}) {
488	$self->archiveinf_db($doc_obj);
489	}
490	if ($self->is_group()) {
491	$self->{'gs_count'}++; # do we want this for all cases?
492	$self->{'group_position'}++;
493	}
494	}
495
496	sub store_output_info_reference {
497	my $self = shift (@_);
498	my ($doc_obj) = @_;
499
500	my $output_info = $self->{'output_info'};
501	my $metaname = $self->{'sortmeta'};
502
503	my $group_position;
504	if ($self->is_group()) {
505	$group_position = $self->{'group_position'};
506	}
507	if (!defined $metaname \|\| $metaname !~ /\S/) {
508	my $OID = $doc_obj->get_OID();
509	$output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
510	return;
511	}
512
513	if ($metaname eq "OID") { # sort by OID
514	my $OID = $doc_obj->get_OID();
515	$output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
516	return;
517	}
518
519	my $metadata = "";
520	my $top_section = $doc_obj->get_top_section();
521
522	my @commameta_list = split(/,/, $metaname);
523	foreach my $cmn (@commameta_list) {
524	my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
525	if ($meta) {
526	# do remove prefix/suffix - this will apply to all values
527	$meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
528	$meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
529	$meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
530	$metadata .= $meta if ($meta);
531	}
532	}
533
534	# store reference in the output_info
535	$output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
536
537	}
538
539
540
541	sub saveas {
542	my $self = shift (@_);
543	my ($doc_obj, $doc_dir) = @_;
544
545	die "Basplug::saveas function must be implemented in sub classes\n";
546	}
547
548	sub get_group_doc_dir {
549	my $self = shift (@_);
550	my ($doc_obj) = @_;
551
552	my $outhandle = $self->{'output_handle'};
553	my $OID = $doc_obj->get_OID();
554	$OID = "NULL" unless defined $OID;
555
556	my $groupsize = $self->{'group_size'};
557	my $gs_count = $self->{'gs_count'};
558	my $open_new_file = (($gs_count % $groupsize)==0);
559
560	my $doc_dir;
561
562	if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
563	# if we have some assoc files, then we will need to start a new file
564	if ($self->{'verbosity'} > 2) {
565	print $outhandle " Starting a archives folder for $OID as it has associated files\n";
566	}
567	$open_new_file = 1;
568	}
569
570	# opening a new file
571	if (($open_new_file) \|\| !defined($self->{'gs_doc_dir'})) {
572	# first we close off the old output
573	if ($gs_count>0)
574	{
575	return if (!$self->close_group_output());
576	}
577
578	# this will create the directory
579	$doc_dir = $self->get_doc_dir ($doc_obj);
580	$self->{'new_doc_dir'} = 1;
581	$self->{'gs_doc_dir'} = $doc_dir;
582	$self->{'group_position'} = 1;
583	}
584	else {
585	$doc_dir = $self->{'gs_doc_dir'};
586	$self->{'new_doc_dir'} = 0;
587	}
588	return $doc_dir;
589
590	}
591	sub get_doc_dir {
592
593	my $self = shift (@_);
594	my ($doc_obj) = @_;
595
596	my $OID = $doc_obj->get_OID();
597	$OID = "NULL" unless defined $OID;
598
599	my $working_dir = $self->get_output_dir();
600	my $working_info = $self->{'output_info'};
601	return if (!defined $working_info);
602
603	my $doc_info = $working_info->get_info($OID);
604	my $doc_dir = '';
605
606	if (defined $doc_info && scalar(@$doc_info) >= 1)
607	{
608	# This OID already has an archives directory, so use it again
609	$doc_dir = $doc_info->[0];
610	$doc_dir =~ s/\/?((doc(mets)?)\|(dublin_core))\.xml(\.gz)?$//;
611	}
612	elsif ($self->{'keep_import_structure'})
613	{
614	my $source_filename = $doc_obj->get_source_filename();
615	$source_filename = &File::Basename::dirname($source_filename);
616	$source_filename =~ s/[\\\/]+/\//g;
617	$source_filename =~ s/\/$//;
618
619	$doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
620	}
621
622	# We have to use a new archives directory for this document
623	if ($doc_dir eq "")
624	{
625	$doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
626	}
627
628	&FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
629
630	return $doc_dir;
631	}
632
633
634	## @function get_new_doc_dir()
635	#
636	# Once a doc object is ready to write to disk (and hence has a nice OID),
637	# generate a unique subdirectory to write the information to.
638	#
639	# - create the directory as part of this call, to try and avoid race conditions
640	# found in parallel processing [jmt12]
641	#
642	# @todo figure out what the rule regarding $work_info->size() is meant to do
643	#
644	# @todo determine what $self->{'group'} is, and whether it should affect
645	# directory creation
646	#
647	sub get_new_doc_dir
648	{
649	my $self = shift (@_);
650	my($working_info,$working_dir,$OID) = @_;
651
652	my $doc_dir = "";
653	my $doc_dir_rest = $OID;
654
655	# remove any \ and / from the OID
656	$doc_dir_rest =~ s/[\\\/]//g;
657
658	# Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
659	if ($ENV{'GSDLOS'} =~ /^windows$/i)
660	{
661	$doc_dir_rest =~ s/\://g;
662	}
663
664	# we generally create a unique directory by adding consequtive fragments of
665	# the document identifier (split by some predefined length - defaulting to
666	# 8) until we find a directory that doesn't yet exist. Note that directories
667	# that contain a document have a suffix ".dir" (whereas those that contain
668	# only subdirectories have no suffix).
669	my $doc_dir_num = 0; # how many directories deep we are
670	my $created_directory = 0; # have we successfully created a new directory
671	do
672	{
673	# (does this work on windows? - jmt12)
674	if ($doc_dir_num > 0)
675	{
676	$doc_dir .= '/';
677	}
678
679	# the default matching pattern grabs the next 'subdir_split_length'
680	# characters of the OID to act as the next subdirectory
681	my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
682
683	# Do we count any "HASH" prefix against the split length limit?
684	if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
685	{
686	$pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
687	}
688
689	# Note the use of 's' to both capture the next chuck of OID and to remove
690	# it from OID at the same time
691	if ($doc_dir_rest =~ s/$pattern//i)
692	{
693	$doc_dir .= $1;
694	$doc_dir_num++;
695
696	my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
697	if(!FileUtils::directoryExists($full_doc_dir))
698	{
699	&FileUtils::makeAllDirectories($full_doc_dir);
700	$created_directory = 1;
701	}
702
703	###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
704	###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
705	###rint STDERR " - rest: $doc_dir_rest\n";
706	###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
707	###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
708	}
709	}
710	while ($doc_dir_rest ne '' && ($created_directory == 0 \|\| ($working_info->size() >= 1024 && $doc_dir_num < 2)));
711
712	# not unique yet? Add on an incremental suffix until we are unique
713	my $i = 1;
714	my $doc_dir_base = $doc_dir;
715	while ($created_directory == 0)
716	{
717	$doc_dir = $doc_dir_base . '-' . $i;
718	$created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
719	$i++;
720	}
721
722	# in theory this should never happen
723	if (!$created_directory)
724	{
725	die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
726	}
727
728	return $doc_dir . '.dir';
729	}
730	## get_new_doc_dir()
731
732
733	sub process_assoc_files {
734	my $self = shift (@_);
735	my ($doc_obj, $doc_dir, $handle) = @_;
736
737	my $outhandle = $self->{'output_handle'};
738
739	my $output_dir = $self->get_output_dir();
740	return if (!defined $output_dir);
741
742	&FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
743
744	my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
745	&FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
746
747	my @assoc_files = ();
748	my $filename;;
749
750	my $source_filename = $doc_obj->get_source_filename();
751
752	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
753
754	if (defined $collect_dir) {
755	my $dirsep_regexp = &util::get_os_dirsep();
756
757	if ($collect_dir !~ /$dirsep_regexp$/) {
758	$collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
759	}
760
761	# This test is never going to fail on Windows -- is this a problem?
762
763	if ($source_filename !~ /^$dirsep_regexp/) {
764	$source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
765	}
766	}
767
768
769	# set the assocfile path (even if we have no assoc files - need this for lucene)
770	$doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
771	"assocfilepath",
772	"$doc_dir");
773	foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
774	my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
775	$dir = "" unless defined $dir;
776
777	my $utf8_real_filename = $assoc_file_rec->[0];
778
779	# for some reasons the image associate file has / before the full path
780	$utf8_real_filename =~ s/^\\(.*)/$1/i;
781
782	## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
783	my $real_filename = $utf8_real_filename;
784	$real_filename = &util::downgrade_if_dos_filename($real_filename);
785
786	if (&FileUtils::fileExists($real_filename)) {
787
788	$filename = &FileUtils::filenameConcatenate($working_dir, $afile);
789
790	&FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
791
792	$doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
793	"gsdlassocfile",
794	"$afile:$assoc_file_rec->[2]:$dir");
795	} elsif ($self->{'verbosity'} > 1) {
796	print $outhandle "BasePlugout::process couldn't copy the associated file " .
797	"$real_filename to $afile\n";
798	}
799	}
800	}
801
802
803	sub process_metafiles_metadata
804	{
805	my $self = shift (@_);
806	my ($doc_obj) = @_;
807
808	my $top_section = $doc_obj->get_top_section();
809	my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
810
811	foreach my $metafile_pair (@$metafiles) {
812	my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
813
814	$doc_obj->metadata_file($full_metafile,$metafile);
815	}
816
817	$doc_obj->delete_metadata($top_section,"gsdlmetafile");
818	}
819
820	sub archiveinf_files_to_field
821	{
822	my $self = shift(@_);
823	my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
824
825	foreach my $file_rec (@$files) {
826	my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
827	my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
828	# for some reasons the image associate file has / before the full path
829	$real_filename =~ s/^\\(.*)/$1/i;
830
831	my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
832
833	if (&FileUtils::fileExists($raw_filename)) {
834
835	# if (defined $collect_dir) {
836	# my $collect_dir_re_safe = $collect_dir;
837	# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
838	# $collect_dir_re_safe =~ s/\./\\./g;##
839
840	# $real_filename =~ s/^$collect_dir_re_safe//;
841	# }
842
843	if (defined $reverse_lookups) {
844	$reverse_lookups->{$real_filename} = 1;
845	}
846
847	if($field =~ m@assoc-file\|src-file\|meta-file@) {
848	$raw_filename = &util::abspath_to_placeholders($raw_filename);
849	}
850
851	### push(@{$oid_files->{$field}},$full_file);
852	push(@{$oid_files->{$field}},$raw_filename);
853	}
854	else {
855	print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
856	}
857	}
858	}
859
860	sub archiveinf_db
861	{
862	my $self = shift (@_);
863	my ($doc_obj) = @_;
864
865	my $verbosity = $self->{'verbosity'};
866
867	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
868	if (defined $collect_dir) {
869	my $dirsep_regexp = &util::get_os_dirsep();
870
871	if ($collect_dir !~ /$dirsep_regexp$/) {
872	# ensure there is a slash at the end
873	$collect_dir .= &util::get_dirsep();
874	}
875	}
876
877	my $oid = $doc_obj->get_OID();
878	my $source_filename = $doc_obj->get_unmodified_source_filename();
879	my $working_info = $self->{'output_info'};
880	my $doc_info = $working_info->get_info($oid);
881
882	my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
883	# doc_file is the path to the archive doc.xml. Make sure it has unix
884	# slashes, then if the collection is copied to linux, it can be built without reimport
885	$doc_file =~ s/\\/\//g;
886	my $oid_files = { 'doc-file' => $doc_file,
887	'index-status' => $index_status,
888	'src-file' => $source_filename,
889	'sort-meta' => $sortmeta,
890	'assoc-file' => [],
891	'meta-file' => [] };
892	if (defined $group_position) {
893	$oid_files->{'group-position'} = $group_position;
894	}
895	my $reverse_lookups = { $source_filename => "1" };
896
897
898	$self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
899	$collect_dir,$oid_files,$reverse_lookups);
900
901
902	$self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
903	$collect_dir,$oid_files);
904
905	# Get the infodbtype value for this collection from the arcinfo object
906	my $infodbtype = $self->{'output_info'}->{'infodbtype'};
907	my $output_dir = $self->{'output_dir'};
908
909	my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
910
911	##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
912
913	if (!$self->{'no_rss'})
914	{
915	if (($oid_files->{'index-status'} eq "I") \|\| ($oid_files->{'index-status'} eq "R")) {
916	my $top_section = $doc_obj->get_top_section();
917
918	# rss_title can be set in collect.cfg as follows:
919	# plugout GreenstoneXMLPlugout -rss_title "dc.Title; ex.Title"
920	# rss_title is a semi-colon or comma-separated list of the metadata field names that should
921	# be consulted in order to obtain a Title (anchor text) for the RSS document link.
922	# If not specified, rss_title will default to dc.Title, and fall back on Untitled
923	my $metafieldnames = $self->{'rss_title'};
924	my @metafieldarray = split(/[,;] ?/,$metafieldnames); # , or ; separator can be followed by an optional space
925	my $titles;
926	#@$titles=(); # at worst @$titles will be (), as get_metadata(dc.Titles) may return ()
927	foreach my $metafieldname (@metafieldarray) {
928	$metafieldname =~ s@^ex\.@@; # if ex.Title, need to get_metadata() on metafieldname=Title
929	$titles = $doc_obj->get_metadata($top_section,$metafieldname);
930
931	if(scalar(@$titles) != 0) { # found at least one title for one metafieldname
932	last; # break out of the loop
933	}
934	}
935
936	# if ex.Title was listed in the metafieldnames, then we'll surely have a value for title for this doc
937	# otherwise, if we have no titles at this point, add in a default of Untitled as this doc's title
938	if(scalar(@$titles) == 0) { #&& $metafieldnames !~ [email protected]@) {
939	push(@$titles, "Untitled");
940	}
941
942	# encode basic html entities like <>"& in the title(s), since the & char can break RSS links
943	for (my $i = 0; $i < scalar(@$titles); $i++) {
944	&ghtml::htmlsafe(@$titles[$i]);
945	}
946
947	my $dc_title = join("; ", @$titles);
948
949	if ($oid_files->{'index-status'} eq "R") {
950	$dc_title .= " (Updated)";
951	}
952
953	my $rss_entry = "<item>\n";
954	$rss_entry .= " <title>$dc_title</title>\n";
955	if(&util::is_gs3()) {
956	$rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
957	} else {
958	$rss_entry .= " <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
959	}
960	$rss_entry .= "</item>";
961
962	if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
963	{
964	my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
965	my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
966	&dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
967	&dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
968	}
969	else
970	{
971	my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
972	my $rss_fh;
973	if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
974	{
975	print $rss_fh $rss_entry . "\n";
976	&FileUtils::closeFileHandle($rss_filename, \$rss_fh);
977	}
978	else
979	{
980	print STDERR "**** Failed to open $rss_filename\n$!\n";
981	}
982	}
983	}
984	}
985
986	$oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
987	$oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
988	$oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
989	$oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
990	$oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
991	if (defined $oid_files->{'group-position'}) {
992	$oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
993	}
994
995	my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
996	&dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
997	&dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
998
999	foreach my $rl (keys %$reverse_lookups) {
1000	$working_info->add_reverseinfo($rl,$oid);
1001	}
1002
1003	# meta files not set in reverese entry, but need to set the metadata flag
1004	if (defined $doc_obj->get_meta_files()) {
1005	foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
1006	my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
1007	$working_info->set_meta_file_flag($full_file);
1008	}
1009	}
1010	}
1011
1012
1013	sub set_sortmeta {
1014	my $self = shift (@_);
1015	my ($sortmeta, $removeprefix, $removesuffix) = @_;
1016
1017	$self->{'sortmeta'} = $sortmeta;
1018	if (defined ($removeprefix) && $removeprefix ) {
1019	$removeprefix =~ s/^\^//; # don't need a leading ^
1020	$self->{'removeprefix'} = $removeprefix;
1021	}
1022	if (defined ($removesuffix) && $removesuffix) {
1023	$removesuffix =~ s/\$$//; # don't need a trailing $
1024	$self->{'removesuffix'} = $removesuffix;
1025	}
1026	}
1027
1028
1029
1030	sub open_xslt_pipe
1031	{
1032	my $self = shift @_;
1033	my ($output_file_name, $xslt_file)=@_;
1034
1035	return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1036
1037	my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1038
1039	my $mapping_file_path = "";
1040
1041	if ($ENV{'GSDLOS'} eq "windows"){
1042	$java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1043	# this file:/// bit didn't work for me on windows XP
1044	#$xslt_file = "\"file:///".$xslt_file."\"";
1045	#$mapping_file_path = "\"file:///";
1046	}
1047	else{
1048	$java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1049	}
1050
1051
1052	$java_class_path = "\"".$java_class_path."\"";
1053
1054	my $cmd = "\| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1055
1056	if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1057	my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1058	$cmd .= "-m $mapping_file_path";
1059	}
1060
1061	open(*XMLWRITER, $cmd)
1062	or die "can't open pipe to xslt: $!";
1063
1064
1065	$self->{'xslt_writer'} = *XMLWRITER;
1066
1067	print XMLWRITER "<?DocStart?>\n";
1068	print XMLWRITER "$output_file_name\n";
1069
1070
1071	}
1072
1073
1074	sub close_xslt_pipe
1075	{
1076	my $self = shift @_;
1077
1078
1079	return unless defined $self->{'xslt_writer'} ;
1080
1081	my $xsltwriter = $self->{'xslt_writer'};
1082
1083	print $xsltwriter "<?DocEnd?>\n";
1084	close($xsltwriter);
1085
1086	undef $self->{'xslt_writer'};
1087
1088	}
1089
1090
1091
1092	#the subclass should implement this method if is_group method could return 1.
1093	sub close_group_output{
1094	my $self = shift (@_);
1095	}
1096
1097	sub is_group {
1098	my $self = shift (@_);
1099	return 0;
1100	}
1101
1102	my $dc_set = { Title => 1,
1103	Creator => 1,
1104	Subject => 1,
1105	Description => 1,
1106	Publisher => 1,
1107	Contributor => 1,
1108	Date => 1,
1109	Type => 1,
1110	Format => 1,
1111	Identifier => 1,
1112	Source => 1,
1113	Language => 1,
1114	Relation => 1,
1115	Coverage => 1,
1116	Rights => 1};
1117
1118
1119	# returns an XML representation of the dublin core metadata
1120	# if dc meta is not found, try ex meta
1121	# This method is not used by the DSpacePlugout, which has its
1122	# own method to save its dc metadata
1123	sub get_dc_metadata {
1124	my $self = shift(@_);
1125	my ($doc_obj, $section, $version) = @_;
1126
1127	# build up string of dublin core metadata
1128	$section="" unless defined $section;
1129
1130	my $section_ptr = $doc_obj->_lookup_section($section);
1131	return "" unless defined $section_ptr;
1132
1133
1134	my $explicit_dc = {};
1135	my $explicit_ex_dc = {};
1136	my $explicit_ex = {};
1137
1138	my $all_text="";
1139
1140	# We want high quality dc metadata to go in first, so we store all the
1141	# assigned dc.* values first. Then, for all those dc metadata names in
1142	# the official dc set that are as yet unassigned, we look to see whether
1143	# embedded ex.dc.* metadata has defined some values for them. If not,
1144	# then for the same missing dc metadata names, we look in ex metadata.
1145
1146	foreach my $data (@{$section_ptr->{'metadata'}}){
1147	my $escaped_value = &docprint::escape_text($data->[1]);
1148	if ($data->[0]=~ m/^dc\./) {
1149	$data->[0] =~ tr/[A-Z]/[a-z]/;
1150
1151	$data->[0] =~ m/^dc\.(.*)/;
1152	my $dc_element = $1;
1153
1154	if (!defined $explicit_dc->{$dc_element}) {
1155	$explicit_dc->{$dc_element} = [];
1156	}
1157	push(@{$explicit_dc->{$dc_element}},$escaped_value);
1158
1159	if (defined $version && ($version eq "oai_dc")) {
1160	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1161	}
1162	else {
1163	# qualifier???
1164	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1165	}
1166
1167	} elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1168	$data->[0] =~ m/^ex\.dc\.(.*)/;
1169	my $ex_dc_element = $1;
1170	my $lc_ex_dc_element = lc($ex_dc_element);
1171
1172	# only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1173	if (defined $dc_set->{$ex_dc_element}) {
1174	if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1175	$explicit_ex_dc->{$lc_ex_dc_element} = [];
1176	}
1177	push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1178	}
1179	}
1180	elsif (($data->[0] =~ m/^ex\./) \|\| ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1181	$data->[0] =~ m/^(ex\.)?(.*)/;
1182	my $ex_element = $2;
1183	my $lc_ex_element = lc($ex_element);
1184
1185	if (defined $dc_set->{$ex_element}) {
1186	if (!defined $explicit_ex->{$lc_ex_element}) {
1187	$explicit_ex->{$lc_ex_element} = [];
1188	}
1189	push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1190	}
1191	}
1192	}
1193
1194	# go through dc_set and for any element not defined in explicit_dc
1195	# that does exist in explicit_ex, add it in as metadata
1196	foreach my $k ( keys %$dc_set ) {
1197	my $lc_k = lc($k);
1198
1199	if (!defined $explicit_dc->{$lc_k}) {
1200	# try to find if ex.dc.* defines this dc.* meta,
1201	# if not, then look for whether there's an ex.* equivalent
1202
1203	if (defined $explicit_ex_dc->{$lc_k}) {
1204	foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1205	my $dc_element = $lc_k;
1206	my $escaped_value = $v;
1207
1208	if (defined $version && ($version eq "oai_dc")) {
1209	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1210	}
1211	else {
1212	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1213	}
1214	}
1215	} elsif (defined $explicit_ex->{$lc_k}) {
1216	foreach my $v (@{$explicit_ex->{$lc_k}}) {
1217	my $dc_element = $lc_k;
1218	my $escaped_value = $v;
1219
1220	if (defined $version && ($version eq "oai_dc")) {
1221	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1222	}
1223	else {
1224	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1225	}
1226	}
1227	}
1228	}
1229	}
1230
1231	if ($all_text eq "") {
1232	$all_text .= " There is no Dublin Core metatdata in this document\n";
1233	}
1234	$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1235
1236	return $all_text;
1237	}
1238
1239	# Build up dublin_core metadata. Priority given to dc.* over ex.*
1240	# This method was apparently added by Jeffrey and committed by Shaoqun.
1241	# But we don't know why it was added, so not using it anymore.
1242	sub new_get_dc_metadata {
1243
1244	my $self = shift(@_);
1245	my ($doc_obj, $section, $version) = @_;
1246
1247	# build up string of dublin core metadata
1248	$section="" unless defined $section;
1249
1250	my $section_ptr=$doc_obj->_lookup_section($section);
1251	return "" unless defined $section_ptr;
1252
1253	my $all_text = "";
1254	foreach my $data (@{$section_ptr->{'metadata'}}){
1255	my $escaped_value = &docprint::escape_text($data->[1]);
1256	my $dc_element = $data->[0];
1257
1258	my @array = split('\.',$dc_element);
1259	my ($type,$name);
1260
1261	if(defined $array[1])
1262	{
1263	$type = $array[0];
1264	$name = $array[1];
1265	}
1266	else
1267	{
1268	$type = "ex";
1269	$name = $array[0];
1270	}
1271
1272	$all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1273	}
1274	return $all_text;
1275	}
1276
1277
1278	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: