Context Navigation

source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 32159

Last change on this file since 32159 was 32159, checked in by ak19, 6 years ago
incremental building was not being incremental when no metadata was assigned to any of the files (as happens with our docs for quick test collections). A default metadata.xml is present, but 'empty' in that it contains no FileSet elements with metadata elements assigned to FileName elements. But we still want incremental behaviour. The idea was to write out an entry into archiveinf-src.db for each metadata.xml processed, not just for each meta.xml file actually referencing a doc, as BasePlugout was doing so far on a per doc basis. Kathy come up with the actual infrastructure that can make it work (to ensure all the necessary objects are available), Dr Bainbridge approved this, and it's now been added into the code.
Property svn:keywords set to `Author Date Id Revision`
File size: 37.7 KB

Line
1	###########################################################################
2	#
3	# BasePlugout.pm -- base class for all the plugout modules
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 2006 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasePlugout;
27
28	eval {require bytes};
29
30	use strict;
31	no strict 'subs';
32	no strict 'refs';
33
34	use dbutil;
35	use gsprintf 'gsprintf';
36	use printusage;
37	use parse2;
38	use util;
39	use FileUtils;
40	use sorttools;
41
42	# suppress the annoying "subroutine redefined" warning that various
43	# gets cause under perl 5.6
44	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
45
46	my $arguments = [
47	{ 'name' => "xslt_file",
48	'desc' => "{BasPlugout.xslt_file}",
49	'type' => "string",
50	'reqd' => "no",
51	'deft' => "",
52	'hiddengli' => "no"},
53	{ 'name' => "subdir_split_length",
54	'desc' => "{BasPlugout.subdir_split_length}",
55	'type' => "int",
56	'reqd' => "no",
57	'deft' => "8",
58	'hiddengli' => "no"},
59	{ 'name' => "subdir_hash_prefix",
60	'desc' => "{BasPlugout.subdir_hash_prefix}",
61	'type' => "flag",
62	'reqd' => "no",
63	'deft' => "0",
64	'hiddengli' => "no"},
65	{ 'name' => "gzip_output",
66	'desc' => "{BasPlugout.gzip_output}",
67	'type' => "flag",
68	'reqd' => "no",
69	'hiddengli' => "no"},
70	{ 'name' => "verbosity",
71	'desc' => "{BasPlugout.verbosity}",
72	'type' => "int",
73	'deft' => "0",
74	'reqd' => "no",
75	'hiddengli' => "no"},
76	{ 'name' => "output_info",
77	'desc' => "{BasPlugout.output_info}",
78	'type' => "string",
79	'reqd' => "yes",
80	'hiddengli' => "yes"},
81	{ 'name' => "output_handle",
82	'desc' => "{BasPlugout.output_handle}",
83	'type' => "string",
84	'deft' => 'STDERR',
85	'reqd' => "no",
86	'hiddengli' => "yes"},
87	{ 'name' => "debug",
88	'desc' => "{BasPlugout.debug}",
89	'type' => "flag",
90	'reqd' => "no",
91	'hiddengli' => "yes"},
92	{ 'name' => 'no_rss',
93	'desc' => "{BasPlugout.no_rss}",
94	'type' => 'flag',
95	'reqd' => 'no',
96	'hiddengli' => 'yes'},
97	{ 'name' => 'rss_title',
98	'desc' => "{BasPlugout.rss_title}",
99	'type' => 'string',
100	'deft' => 'dc.Title',
101	'reqd' => 'no',
102	'hiddengli' => 'yes'},
103	{ 'name' => "no_auxiliary_databases",
104	'desc' => "{BasPlugout.no_auxiliary_databases}",
105	'type' => "flag",
106	'reqd' => "no",
107	'hiddengli' => "yes"}
108
109	];
110
111	my $options = { 'name' => "BasePlugout",
112	'desc' => "{BasPlugout.desc}",
113	'abstract' => "yes",
114	'inherits' => "no",
115	'args' => $arguments};
116
117	sub new
118	{
119	my $class = shift (@_);
120
121	my ($plugoutlist,$args,$hashArgOptLists) = @_;
122	push(@$plugoutlist, $class);
123
124	my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
125
126	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
127	push(@{$hashArgOptLists->{"OptList"}},$options);
128
129	my $self = {};
130	$self->{'plugout_type'} = $class;
131	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
132	$self->{"info_only"} = 0;
133
134	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
135	# the args, just return the object.
136	foreach my $strArg (@{$args})
137	{
138	if(defined $strArg && $strArg eq "-gsdlinfo")
139	{
140	$self->{"info_only"} = 1;
141	return bless $self, $class;
142	}
143	}
144
145	delete $self->{"info_only"};
146
147	if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
148	{
149	my $classTempClass = bless $self, $class;
150	print STDERR "<BadPlugout d=$plugout_name>\n";
151	&gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
152	$classTempClass->print_txt_usage(""); # Use default resource bundle
153	die "\n";
154	}
155
156
157	if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
158	{
159	my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
160	if (!defined $full_file_path) {
161	print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
162	die "\n";
163	}
164	$self->{'xslt_file'} = $full_file_path;
165	}
166
167	# for group processing
168	$self->{'gs_count'} = 0;
169	$self->{'group_position'} = 1;
170
171	$self->{'keep_import_structure'} = 0;
172
173	$self->{'generate_databases'} = 1;
174	if ($self->{'no_auxiliary_databases'}) {
175	$self->{'generate_databases'} = 0;
176	}
177	undef $self->{'no_auxiliary_databases'};
178	return bless $self, $class;
179
180	}
181
182	# implement this in subclass if you want to do some initialization after
183	# loading and setting parameters, and before processing the documents.
184	sub begin {
185
186	my $self= shift (@_);
187
188	}
189	sub print_xml_usage
190	{
191	my $self = shift(@_);
192	my $header = shift(@_);
193	my $high_level_information_only = shift(@_);
194
195	# XML output is always in UTF-8
196	gsprintf::output_strings_in_UTF8;
197
198	if ($header) {
199	&PrintUsage::print_xml_header("plugout");
200	}
201	$self->print_xml($high_level_information_only);
202	}
203
204
205	sub print_xml
206	{
207	my $self = shift(@_);
208	my $high_level_information_only = shift(@_);
209
210	my $optionlistref = $self->{'option_list'};
211	my @optionlist = @$optionlistref;
212	my $plugoutoptions = shift(@$optionlistref);
213	return if (!defined($plugoutoptions));
214
215	gsprintf(STDERR, "<PlugoutInfo>\n");
216	gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
217	my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
218	$desc =~ s/</&lt;/g; # doubly escaped
219	$desc =~ s/>/&gt;/g;
220	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221	gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
222	gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
223	unless (defined($high_level_information_only)) {
224	gsprintf(STDERR, " <Arguments>\n");
225	if (defined($plugoutoptions->{'args'})) {
226	&PrintUsage::print_options_xml($plugoutoptions->{'args'});
227	}
228	gsprintf(STDERR, " </Arguments>\n");
229
230	# Recurse up the plugout hierarchy
231	$self->print_xml();
232	}
233	gsprintf(STDERR, "</PlugoutInfo>\n");
234	}
235
236
237	sub print_txt_usage
238	{
239	my $self = shift(@_);
240
241	# Print the usage message for a plugout (recursively)
242	my $descoffset = $self->determine_description_offset(0);
243	$self->print_plugout_usage($descoffset, 1);
244	}
245
246	sub determine_description_offset
247	{
248	my $self = shift(@_);
249	my $maxoffset = shift(@_);
250
251	my $optionlistref = $self->{'option_list'};
252	my @optionlist = @$optionlistref;
253	my $plugoutoptions = pop(@$optionlistref);
254	return $maxoffset if (!defined($plugoutoptions));
255
256	# Find the length of the longest option string of this download
257	my $plugoutargs = $plugoutoptions->{'args'};
258	if (defined($plugoutargs)) {
259	my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
260	if ($longest > $maxoffset) {
261	$maxoffset = $longest;
262	}
263	}
264
265	# Recurse up the download hierarchy
266	$maxoffset = $self->determine_description_offset($maxoffset);
267	$self->{'option_list'} = \@optionlist;
268	return $maxoffset;
269	}
270
271
272	sub print_plugout_usage
273	{
274	my $self = shift(@_);
275	my $descoffset = shift(@_);
276	my $isleafclass = shift(@_);
277
278	my $optionlistref = $self->{'option_list'};
279	my @optionlist = @$optionlistref;
280	my $plugoutoptions = shift(@$optionlistref);
281	return if (!defined($plugoutoptions));
282
283	my $plugoutname = $plugoutoptions->{'name'};
284	my $plugoutargs = $plugoutoptions->{'args'};
285	my $plugoutdesc = $plugoutoptions->{'desc'};
286
287	# Produce the usage information using the data structure above
288	if ($isleafclass) {
289	if (defined($plugoutdesc)) {
290	gsprintf(STDERR, "$plugoutdesc\n\n");
291	}
292	gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
293	}
294
295	# Display the download options, if there are some
296	if (defined($plugoutargs)) {
297	# Calculate the column offset of the option descriptions
298	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
299
300	if ($isleafclass) {
301	gsprintf(STDERR, " {common.specific_options}:\n");
302	}
303	else {
304	gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
305	}
306
307	# Display the download options
308	&PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
309	}
310
311	# Recurse up the download hierarchy
312	$self->print_plugout_usage($descoffset, 0);
313	$self->{'option_list'} = \@optionlist;
314	}
315
316
317	sub error
318	{
319	my ($strFunctionName,$strError) = @_;
320	{
321	print "Error occoured in BasePlugout.pm\n".
322	"In Function: ".$strFunctionName."\n".
323	"Error Message: ".$strError."\n";
324	exit(-1);
325	}
326	}
327
328	# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
329	sub set_OIDtype {
330	my $self = shift (@_);
331	my ($type, $metadata) = @_;
332
333	if ($type =~ /^(hash\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
334	$self->{'OIDtype'} = $type;
335	} else {
336	$self->{'OIDtype'} = "hash";
337	}
338	if ($type =~ /^assigned$/) {
339	if (defined $metadata) {
340	$self->{'OIDmetadata'} = $metadata;
341	} else {
342	$self->{'OIDmetadata'} = "dc.Identifier";
343	}
344	}
345	}
346
347	sub set_output_dir
348	{
349	my $self = shift @_;
350	my ($output_dir) = @_;
351
352	$self->{'output_dir'} = $output_dir;
353	}
354
355	sub setoutputdir
356	{
357	my $self = shift @_;
358	my ($output_dir) = @_;
359
360	$self->{'output_dir'} = $output_dir;
361	}
362
363	sub get_output_dir
364	{
365	my $self = shift (@_);
366
367	return $self->{'output_dir'};
368	}
369
370	sub getoutputdir
371	{
372	my $self = shift (@_);
373
374	return $self->{'output_dir'};
375	}
376
377	sub getoutputinfo
378	{
379	my $self = shift (@_);
380
381	return $self->{'output_info'};
382	}
383
384
385	sub get_output_handler
386	{
387	my $self = shift (@_);
388
389	my ($output_file_name) = @_;
390
391	my $fh;
392	&FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
393
394	return $fh;
395	}
396
397	sub release_output_handler
398	{
399	my $self = shift (@_);
400	my ($outhandler) = @_;
401
402	close($outhandler);
403
404	}
405
406	sub output_xml_header {
407	my $self = shift (@_);
408	my ($handle,$docroot,$nondoctype) = @_;
409
410
411	#print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
412
413	#For Dspace must be UTF in lower case
414	print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
415
416	if (!defined $nondoctype){
417	my $doctype = (defined $docroot) ? $docroot : "Section";
418
419	# Used to be '<!DOCTYPE Archive SYSTEM ...'
420
421	print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
422	}
423
424	print $handle "<$docroot>\n" if defined $docroot;
425	}
426
427	sub output_xml_footer {
428	my $self = shift (@_);
429	my ($handle,$docroot) = @_;
430	print $handle "</$docroot>\n" if defined $docroot;
431	}
432
433
434	sub output_general_xml_header
435	{
436	my $self = shift (@_);
437	my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
438
439	print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
440
441	if (defined $opt_dtd) {
442	my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
443	print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
444	}
445
446	if (defined $docroot) {
447	my $full_docroot = $docroot;
448	if (defined $opt_attributes) {
449	$full_docroot .= " $opt_attributes";
450	}
451
452	print $handle "<$full_docroot>\n"
453	}
454	}
455
456	sub output_general_xml_footer
457	{
458	output_xml_footer(@_);
459	}
460
461	# This is called by the plugins after read_into_doc_obj generates the doc_obj.
462	sub process {
463	my $self = shift (@_);
464	my ($doc_obj) = @_;
465
466	my $output_info = $self->{'output_info'};
467	return if (!defined $output_info);
468
469	# for OAI purposes
470	$doc_obj->set_lastmodified();
471	$doc_obj->set_oailastmodified();
472
473	# find out which directory to save to
474	my $doc_dir = "";
475	if ($self->is_group()) {
476	$doc_dir = $self->get_group_doc_dir($doc_obj);
477	} else {
478	$doc_dir = $self->get_doc_dir($doc_obj);
479	}
480
481	##############################
482	# call subclass' saveas method
483	##############################
484	$self->saveas($doc_obj,$doc_dir);
485
486	# write out data to archiveinf-doc.db
487	if ($self->{'generate_databases'}) {
488	$self->archiveinf_db($doc_obj);
489	}
490	if ($self->is_group()) {
491	$self->{'gs_count'}++; # do we want this for all cases?
492	$self->{'group_position'}++;
493	}
494	}
495
496	sub store_output_info_reference {
497	my $self = shift (@_);
498	my ($doc_obj) = @_;
499
500	my $output_info = $self->{'output_info'};
501	my $metaname = $self->{'sortmeta'};
502
503	my $group_position;
504	if ($self->is_group()) {
505	$group_position = $self->{'group_position'};
506	}
507	if (!defined $metaname \|\| $metaname !~ /\S/) {
508	my $OID = $doc_obj->get_OID();
509	$output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
510	return;
511	}
512
513	if ($metaname eq "OID") { # sort by OID
514	my $OID = $doc_obj->get_OID();
515	$output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
516	return;
517	}
518
519	my $metadata = "";
520	my $top_section = $doc_obj->get_top_section();
521
522	my @commameta_list = split(/,/, $metaname);
523	foreach my $cmn (@commameta_list) {
524	my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
525	if ($meta) {
526	# do remove prefix/suffix - this will apply to all values
527	$meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
528	$meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
529	$meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
530	$metadata .= $meta if ($meta);
531	}
532	}
533
534	# store reference in the output_info
535	$output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
536
537	}
538
539
540
541	sub saveas {
542	my $self = shift (@_);
543	my ($doc_obj, $doc_dir) = @_;
544
545	die "Basplug::saveas function must be implemented in sub classes\n";
546	}
547
548	sub get_group_doc_dir {
549	my $self = shift (@_);
550	my ($doc_obj) = @_;
551
552	my $outhandle = $self->{'output_handle'};
553	my $OID = $doc_obj->get_OID();
554	$OID = "NULL" unless defined $OID;
555
556	my $groupsize = $self->{'group_size'};
557	my $gs_count = $self->{'gs_count'};
558	my $open_new_file = (($gs_count % $groupsize)==0);
559
560	my $doc_dir;
561
562	if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
563	# if we have some assoc files, then we will need to start a new file
564	if ($self->{'verbosity'} > 2) {
565	print $outhandle " Starting a archives folder for $OID as it has associated files\n";
566	}
567	$open_new_file = 1;
568	}
569
570	# opening a new file
571	if (($open_new_file) \|\| !defined($self->{'gs_doc_dir'})) {
572	# first we close off the old output
573	if ($gs_count>0)
574	{
575	return if (!$self->close_group_output());
576	}
577
578	# this will create the directory
579	$doc_dir = $self->get_doc_dir ($doc_obj);
580	$self->{'new_doc_dir'} = 1;
581	$self->{'gs_doc_dir'} = $doc_dir;
582	$self->{'group_position'} = 1;
583	}
584	else {
585	$doc_dir = $self->{'gs_doc_dir'};
586	$self->{'new_doc_dir'} = 0;
587	}
588	return $doc_dir;
589
590	}
591	sub get_doc_dir {
592
593	my $self = shift (@_);
594	my ($doc_obj) = @_;
595
596	my $OID = $doc_obj->get_OID();
597	$OID = "NULL" unless defined $OID;
598
599	my $working_dir = $self->get_output_dir();
600	my $working_info = $self->{'output_info'};
601	return if (!defined $working_info);
602
603	my $doc_info = $working_info->get_info($OID);
604	my $doc_dir = '';
605
606	if (defined $doc_info && scalar(@$doc_info) >= 1)
607	{
608	# This OID already has an archives directory, so use it again
609	$doc_dir = $doc_info->[0];
610	$doc_dir =~ s/\/?((doc(mets)?)\|(dublin_core))\.xml(\.gz)?$//;
611	}
612	elsif ($self->{'keep_import_structure'})
613	{
614	my $source_filename = $doc_obj->get_source_filename();
615	$source_filename = &File::Basename::dirname($source_filename);
616	$source_filename =~ s/[\\\/]+/\//g;
617	$source_filename =~ s/\/$//;
618
619	$doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
620	}
621
622	# We have to use a new archives directory for this document
623	if ($doc_dir eq "")
624	{
625	$doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
626	}
627
628	&FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
629
630	return $doc_dir;
631	}
632
633
634	## @function get_new_doc_dir()
635	#
636	# Once a doc object is ready to write to disk (and hence has a nice OID),
637	# generate a unique subdirectory to write the information to.
638	#
639	# - create the directory as part of this call, to try and avoid race conditions
640	# found in parallel processing [jmt12]
641	#
642	# @todo figure out what the rule regarding $work_info->size() is meant to do
643	#
644	# @todo determine what $self->{'group'} is, and whether it should affect
645	# directory creation
646	#
647	sub get_new_doc_dir
648	{
649	my $self = shift (@_);
650	my($working_info,$working_dir,$OID) = @_;
651
652	my $doc_dir = "";
653	my $doc_dir_rest = $OID;
654
655	# remove any \ and / from the OID
656	$doc_dir_rest =~ s/[\\\/]//g;
657
658	# Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
659	if ($ENV{'GSDLOS'} =~ /^windows$/i)
660	{
661	$doc_dir_rest =~ s/\://g;
662	}
663
664	# we generally create a unique directory by adding consequtive fragments of
665	# the document identifier (split by some predefined length - defaulting to
666	# 8) until we find a directory that doesn't yet exist. Note that directories
667	# that contain a document have a suffix ".dir" (whereas those that contain
668	# only subdirectories have no suffix).
669	my $doc_dir_num = 0; # how many directories deep we are
670	my $created_directory = 0; # have we successfully created a new directory
671	do
672	{
673	# (does this work on windows? - jmt12)
674	if ($doc_dir_num > 0)
675	{
676	$doc_dir .= '/';
677	}
678
679	# the default matching pattern grabs the next 'subdir_split_length'
680	# characters of the OID to act as the next subdirectory
681	my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
682
683	# Do we count any "HASH" prefix against the split length limit?
684	if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
685	{
686	$pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
687	}
688
689	# Note the use of 's' to both capture the next chuck of OID and to remove
690	# it from OID at the same time
691	if ($doc_dir_rest =~ s/$pattern//i)
692	{
693	$doc_dir .= $1;
694	$doc_dir_num++;
695
696	my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
697	if(!FileUtils::directoryExists($full_doc_dir))
698	{
699	&FileUtils::makeAllDirectories($full_doc_dir);
700	$created_directory = 1;
701	}
702
703	###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
704	###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
705	###rint STDERR " - rest: $doc_dir_rest\n";
706	###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
707	###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
708	}
709	}
710	while ($doc_dir_rest ne '' && ($created_directory == 0 \|\| ($working_info->size() >= 1024 && $doc_dir_num < 2)));
711
712	# not unique yet? Add on an incremental suffix until we are unique
713	my $i = 1;
714	my $doc_dir_base = $doc_dir;
715	while ($created_directory == 0)
716	{
717	$doc_dir = $doc_dir_base . '-' . $i;
718	$created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
719	$i++;
720	}
721
722	# in theory this should never happen
723	if (!$created_directory)
724	{
725	die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
726	}
727
728	return $doc_dir . '.dir';
729	}
730	## get_new_doc_dir()
731
732
733	sub process_assoc_files {
734	my $self = shift (@_);
735	my ($doc_obj, $doc_dir, $handle) = @_;
736
737	my $outhandle = $self->{'output_handle'};
738
739	my $output_dir = $self->get_output_dir();
740	return if (!defined $output_dir);
741
742	&FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
743
744	my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
745	&FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
746
747	my @assoc_files = ();
748	my $filename;;
749
750	my $source_filename = $doc_obj->get_source_filename();
751
752	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
753
754	if (defined $collect_dir) {
755	my $dirsep_regexp = &util::get_os_dirsep();
756
757	if ($collect_dir !~ /$dirsep_regexp$/) {
758	$collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
759	}
760
761	# This test is never going to fail on Windows -- is this a problem?
762
763	if ($source_filename !~ /^$dirsep_regexp/) {
764	$source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
765	}
766	}
767
768
769	# set the assocfile path (even if we have no assoc files - need this for lucene)
770	$doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
771	"assocfilepath",
772	"$doc_dir");
773	foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
774	my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
775	$dir = "" unless defined $dir;
776
777	my $utf8_real_filename = $assoc_file_rec->[0];
778
779	# for some reasons the image associate file has / before the full path
780	$utf8_real_filename =~ s/^\\(.*)/$1/i;
781
782	## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
783	my $real_filename = $utf8_real_filename;
784	$real_filename = &util::downgrade_if_dos_filename($real_filename);
785
786	if (&FileUtils::fileExists($real_filename)) {
787
788	$filename = &FileUtils::filenameConcatenate($working_dir, $afile);
789
790	&FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
791
792	$doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
793	"gsdlassocfile",
794	"$afile:$assoc_file_rec->[2]:$dir");
795	} elsif ($self->{'verbosity'} > 1) {
796	print $outhandle "BasePlugout::process couldn't copy the associated file " .
797	"$real_filename to $afile\n";
798	}
799	}
800	}
801
802
803	sub process_metafiles_metadata
804	{
805	my $self = shift (@_);
806	my ($doc_obj) = @_;
807
808	my $top_section = $doc_obj->get_top_section();
809	my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
810
811	foreach my $metafile_pair (@$metafiles) {
812	my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
813
814	$doc_obj->metadata_file($full_metafile,$metafile);
815	}
816
817	$doc_obj->delete_metadata($top_section,"gsdlmetafile");
818	}
819
820	sub archiveinf_files_to_field
821	{
822	my $self = shift(@_);
823	my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
824
825	foreach my $file_rec (@$files) {
826	my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
827	my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
828	# for some reasons the image associate file has / before the full path
829	$real_filename =~ s/^\\(.*)/$1/i;
830
831	my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
832
833	if (&FileUtils::fileExists($raw_filename)) {
834
835	# if (defined $collect_dir) {
836	# my $collect_dir_re_safe = $collect_dir;
837	# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
838	# $collect_dir_re_safe =~ s/\./\\./g;##
839
840	# $real_filename =~ s/^$collect_dir_re_safe//;
841	# }
842
843	if (defined $reverse_lookups) {
844	$reverse_lookups->{$real_filename} = 1;
845	}
846
847	if($field =~ m@assoc-file\|src-file\|meta-file@) {
848	$raw_filename = &util::abspath_to_placeholders($raw_filename);
849	}
850
851	### push(@{$oid_files->{$field}},$full_file);
852	push(@{$oid_files->{$field}},$raw_filename);
853	}
854	else {
855	print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
856	}
857	}
858	}
859
860	sub archiveinf_db
861	{
862	my $self = shift (@_);
863	my ($doc_obj) = @_;
864
865	my $verbosity = $self->{'verbosity'};
866
867	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
868	if (defined $collect_dir) {
869	my $dirsep_regexp = &util::get_os_dirsep();
870
871	if ($collect_dir !~ /$dirsep_regexp$/) {
872	# ensure there is a slash at the end
873	$collect_dir .= &util::get_dirsep();
874	}
875	}
876
877	my $oid = $doc_obj->get_OID();
878	my $source_filename = $doc_obj->get_unmodified_source_filename();
879	my $working_info = $self->{'output_info'};
880	my $doc_info = $working_info->get_info($oid);
881
882	my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
883	# doc_file is the path to the archive doc.xml. Make sure it has unix
884	# slashes, then if the collection is copied to linux, it can be built without reimport
885	$doc_file =~ s/\\/\//g;
886	my $oid_files = { 'doc-file' => $doc_file,
887	'index-status' => $index_status,
888	'src-file' => $source_filename,
889	'sort-meta' => $sortmeta,
890	'assoc-file' => [],
891	'meta-file' => [] };
892	if (defined $group_position) {
893	$oid_files->{'group-position'} = $group_position;
894	}
895	my $reverse_lookups = { $source_filename => "1" };
896
897
898	$self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
899	$collect_dir,$oid_files,$reverse_lookups);
900
901
902	$self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
903	$collect_dir,$oid_files);
904
905	# Get the infodbtype value for this collection from the arcinfo object
906	my $infodbtype = $self->{'output_info'}->{'infodbtype'};
907	my $output_dir = $self->{'output_dir'};
908
909	my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
910
911	##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
912
913	if (!$self->{'no_rss'})
914	{
915	if (($oid_files->{'index-status'} eq "I") \|\| ($oid_files->{'index-status'} eq "R")) {
916	my $top_section = $doc_obj->get_top_section();
917
918	# rss_title can be set in collect.cfg as follows:
919	# plugout GreenstoneXMLPlugout -rss_title "dc.Title; ex.Title"
920	# rss_title is a semi-colon or comma-separated list of the metadata field names that should
921	# be consulted in order to obtain a Title (anchor text) for the RSS document link.
922	# If not specified, rss_title will default to dc.Title, and fall back on Untitled
923	my $metafieldnames = $self->{'rss_title'};
924	my @metafieldarray = split(/[,;] ?/,$metafieldnames); # , or ; separator can be followed by an optional space
925	my $titles;
926	#@$titles=(); # at worst @$titles will be (), as get_metadata(dc.Titles) may return ()
927	foreach my $metafieldname (@metafieldarray) {
928	$metafieldname =~ s@^ex\.@@; # if ex.Title, need to get_metadata() on metafieldname=Title
929	$titles = $doc_obj->get_metadata($top_section,$metafieldname);
930
931	if(scalar(@$titles) != 0) { # found at least one title for one metafieldname
932	last; # break out of the loop
933	}
934	}
935
936	# if ex.Title was listed in the metafieldnames, then we'll surely have a value for title for this doc
937	# otherwise, if we have no titles at this point, add in a default of Untitled as this doc's title
938	if(scalar(@$titles) == 0) { #&& $metafieldnames !~ [email protected]@) {
939	push(@$titles, "Untitled");
940	}
941
942	# encode basic html entities like <>"& in the title(s), since the & char can break RSS links
943	for (my $i = 0; $i < scalar(@$titles); $i++) {
944	&ghtml::htmlsafe(@$titles[$i]);
945	}
946
947	my $dc_title = join("; ", @$titles);
948
949	if ($oid_files->{'index-status'} eq "R") {
950	$dc_title .= " (Updated)";
951	}
952
953	my $rss_entry = "<item>\n";
954	$rss_entry .= " <title>$dc_title</title>\n";
955	if(&util::is_gs3()) {
956	$rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
957	} else {
958	$rss_entry .= " <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
959	}
960	$rss_entry .= "</item>";
961
962	if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
963	{
964	my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
965	my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
966	&dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
967	&dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
968	}
969	else
970	{
971	my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
972	my $rss_fh;
973	if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
974	{
975	print $rss_fh $rss_entry . "\n";
976	&FileUtils::closeFileHandle($rss_filename, \$rss_fh);
977	}
978	else
979	{
980	print STDERR "**** Failed to open $rss_filename\n$!\n";
981	}
982	}
983	}
984	}
985
986	$oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
987	$oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
988	$oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
989	$oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
990	$oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
991	if (defined $oid_files->{'group-position'}) {
992	$oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
993	}
994
995	my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
996	&dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
997	&dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
998
999	foreach my $rl (keys %$reverse_lookups) {
1000	$working_info->add_reverseinfo($rl,$oid);
1001	}
1002
1003	# meta files not set in reverse entry, but need to set the metadata flag
1004	if (defined $doc_obj->get_meta_files()) {
1005	foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
1006	my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
1007	$working_info->set_meta_file_flag($full_file);
1008	}
1009	}
1010	}
1011
1012	# This sub is called for every metadata.xml accepted for processing by by MetdataXMLPlugin
1013	# and adds an entry into archiveinf-src.db for that file in the form:
1014	# [@THISCOLLECTPATH@/import/metadata.xml]
1015	# <meta-file>1
1016	# This prevents blind reprocessing of the same old docs upon incremental building whenever
1017	# we encounter a default empty metadata.xml that has no actual <FileSet> content defined.
1018	sub add_metaxml_file_entry_to_archiveinfsrc {
1019	my $self = shift (@_);
1020	my ($full_file) = @_;
1021
1022	print STDERR "**** Adding metaxml file entry for full_file: $full_file\n";
1023	my $working_info = $self->{'output_info'};
1024	$working_info->set_meta_file_flag($full_file);
1025	}
1026
1027
1028	sub set_sortmeta {
1029	my $self = shift (@_);
1030	my ($sortmeta, $removeprefix, $removesuffix) = @_;
1031
1032	$self->{'sortmeta'} = $sortmeta;
1033	if (defined ($removeprefix) && $removeprefix ) {
1034	$removeprefix =~ s/^\^//; # don't need a leading ^
1035	$self->{'removeprefix'} = $removeprefix;
1036	}
1037	if (defined ($removesuffix) && $removesuffix) {
1038	$removesuffix =~ s/\$$//; # don't need a trailing $
1039	$self->{'removesuffix'} = $removesuffix;
1040	}
1041	}
1042
1043
1044
1045	sub open_xslt_pipe
1046	{
1047	my $self = shift @_;
1048	my ($output_file_name, $xslt_file)=@_;
1049
1050	return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1051
1052	my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1053
1054	my $mapping_file_path = "";
1055
1056	if ($ENV{'GSDLOS'} eq "windows"){
1057	$java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1058	# this file:/// bit didn't work for me on windows XP
1059	#$xslt_file = "\"file:///".$xslt_file."\"";
1060	#$mapping_file_path = "\"file:///";
1061	}
1062	else{
1063	$java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1064	}
1065
1066
1067	$java_class_path = "\"".$java_class_path."\"";
1068
1069	my $cmd = "\| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1070
1071	if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1072	my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1073	$cmd .= "-m $mapping_file_path";
1074	}
1075
1076	open(*XMLWRITER, $cmd)
1077	or die "can't open pipe to xslt: $!";
1078
1079
1080	$self->{'xslt_writer'} = *XMLWRITER;
1081
1082	print XMLWRITER "<?DocStart?>\n";
1083	print XMLWRITER "$output_file_name\n";
1084
1085
1086	}
1087
1088
1089	sub close_xslt_pipe
1090	{
1091	my $self = shift @_;
1092
1093
1094	return unless defined $self->{'xslt_writer'} ;
1095
1096	my $xsltwriter = $self->{'xslt_writer'};
1097
1098	print $xsltwriter "<?DocEnd?>\n";
1099	close($xsltwriter);
1100
1101	undef $self->{'xslt_writer'};
1102
1103	}
1104
1105
1106
1107	#the subclass should implement this method if is_group method could return 1.
1108	sub close_group_output{
1109	my $self = shift (@_);
1110	}
1111
1112	sub is_group {
1113	my $self = shift (@_);
1114	return 0;
1115	}
1116
1117	my $dc_set = { Title => 1,
1118	Creator => 1,
1119	Subject => 1,
1120	Description => 1,
1121	Publisher => 1,
1122	Contributor => 1,
1123	Date => 1,
1124	Type => 1,
1125	Format => 1,
1126	Identifier => 1,
1127	Source => 1,
1128	Language => 1,
1129	Relation => 1,
1130	Coverage => 1,
1131	Rights => 1};
1132
1133
1134	# returns an XML representation of the dublin core metadata
1135	# if dc meta is not found, try ex meta
1136	# This method is not used by the DSpacePlugout, which has its
1137	# own method to save its dc metadata
1138	sub get_dc_metadata {
1139	my $self = shift(@_);
1140	my ($doc_obj, $section, $version) = @_;
1141
1142	# build up string of dublin core metadata
1143	$section="" unless defined $section;
1144
1145	my $section_ptr = $doc_obj->_lookup_section($section);
1146	return "" unless defined $section_ptr;
1147
1148
1149	my $explicit_dc = {};
1150	my $explicit_ex_dc = {};
1151	my $explicit_ex = {};
1152
1153	my $all_text="";
1154
1155	# We want high quality dc metadata to go in first, so we store all the
1156	# assigned dc.* values first. Then, for all those dc metadata names in
1157	# the official dc set that are as yet unassigned, we look to see whether
1158	# embedded ex.dc.* metadata has defined some values for them. If not,
1159	# then for the same missing dc metadata names, we look in ex metadata.
1160
1161	foreach my $data (@{$section_ptr->{'metadata'}}){
1162	my $escaped_value = &docprint::escape_text($data->[1]);
1163	if ($data->[0]=~ m/^dc\./) {
1164	$data->[0] =~ tr/[A-Z]/[a-z]/;
1165
1166	$data->[0] =~ m/^dc\.(.*)/;
1167	my $dc_element = $1;
1168
1169	if (!defined $explicit_dc->{$dc_element}) {
1170	$explicit_dc->{$dc_element} = [];
1171	}
1172	push(@{$explicit_dc->{$dc_element}},$escaped_value);
1173
1174	if (defined $version && ($version eq "oai_dc")) {
1175	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1176	}
1177	else {
1178	# qualifier???
1179	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1180	}
1181
1182	} elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1183	$data->[0] =~ m/^ex\.dc\.(.*)/;
1184	my $ex_dc_element = $1;
1185	my $lc_ex_dc_element = lc($ex_dc_element);
1186
1187	# only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1188	if (defined $dc_set->{$ex_dc_element}) {
1189	if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1190	$explicit_ex_dc->{$lc_ex_dc_element} = [];
1191	}
1192	push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1193	}
1194	}
1195	elsif (($data->[0] =~ m/^ex\./) \|\| ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1196	$data->[0] =~ m/^(ex\.)?(.*)/;
1197	my $ex_element = $2;
1198	my $lc_ex_element = lc($ex_element);
1199
1200	if (defined $dc_set->{$ex_element}) {
1201	if (!defined $explicit_ex->{$lc_ex_element}) {
1202	$explicit_ex->{$lc_ex_element} = [];
1203	}
1204	push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1205	}
1206	}
1207	}
1208
1209	# go through dc_set and for any element not defined in explicit_dc
1210	# that does exist in explicit_ex, add it in as metadata
1211	foreach my $k ( keys %$dc_set ) {
1212	my $lc_k = lc($k);
1213
1214	if (!defined $explicit_dc->{$lc_k}) {
1215	# try to find if ex.dc.* defines this dc.* meta,
1216	# if not, then look for whether there's an ex.* equivalent
1217
1218	if (defined $explicit_ex_dc->{$lc_k}) {
1219	foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1220	my $dc_element = $lc_k;
1221	my $escaped_value = $v;
1222
1223	if (defined $version && ($version eq "oai_dc")) {
1224	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1225	}
1226	else {
1227	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1228	}
1229	}
1230	} elsif (defined $explicit_ex->{$lc_k}) {
1231	foreach my $v (@{$explicit_ex->{$lc_k}}) {
1232	my $dc_element = $lc_k;
1233	my $escaped_value = $v;
1234
1235	if (defined $version && ($version eq "oai_dc")) {
1236	$all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1237	}
1238	else {
1239	$all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1240	}
1241	}
1242	}
1243	}
1244	}
1245
1246	if ($all_text eq "") {
1247	$all_text .= " There is no Dublin Core metatdata in this document\n";
1248	}
1249	$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1250
1251	return $all_text;
1252	}
1253
1254	# Build up dublin_core metadata. Priority given to dc.* over ex.*
1255	# This method was apparently added by Jeffrey and committed by Shaoqun.
1256	# But we don't know why it was added, so not using it anymore.
1257	sub new_get_dc_metadata {
1258
1259	my $self = shift(@_);
1260	my ($doc_obj, $section, $version) = @_;
1261
1262	# build up string of dublin core metadata
1263	$section="" unless defined $section;
1264
1265	my $section_ptr=$doc_obj->_lookup_section($section);
1266	return "" unless defined $section_ptr;
1267
1268	my $all_text = "";
1269	foreach my $data (@{$section_ptr->{'metadata'}}){
1270	my $escaped_value = &docprint::escape_text($data->[1]);
1271	my $dc_element = $data->[0];
1272
1273	my @array = split('\.',$dc_element);
1274	my ($type,$name);
1275
1276	if(defined $array[1])
1277	{
1278	$type = $array[0];
1279	$name = $array[1];
1280	}
1281	else
1282	{
1283	$type = "ex";
1284	$name = $array[0];
1285	}
1286
1287	$all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1288	}
1289	return $all_text;
1290	}
1291
1292
1293	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: