Context Navigation

source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 23182

Last change on this file since 23182 was 23182, checked in by kjdon, 14 years ago
fixed up bug with deleting assoc files. Was fine for a delete, but for an update, need to delete the old ones before adding the new ones. And also, don't delete the lucene text doc when updating. Presumably it has already been overwritten with the correct version
Property svn:keywords set to `Author Date Id Revision`
File size: 22.3 KB

Line
1	##########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict;
38	no strict 'subs';
39	no strict 'refs';
40	use util;
41
# Establish the inheritance chain at compile time: basebuildproc is a docproc.
BEGIN {
    @basebuildproc::ISA = qw(docproc);
}
45
# Constructor.
#
# Arguments: class name, collection name, source directory, build directory,
# keepold flag, verbosity and (optionally) a handle for diagnostic output.
# When $keepold is true the document/section/byte counters are seeded from
# an existing build.cfg (looked for in the build directory first, then in
# $GSDLCOLLECTDIR/index); otherwise they start at zero.
# Returns the new object blessed into $class.
sub new()
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    my $self = docproc->new();

    # outhandle receives debugging information;
    # output_handle (set further down) receives the output of the plugins
    # (i.e. what is piped on to mg, the database etc.)
    $outhandle = STDERR if !defined $outhandle;

    @{$self}{qw(collection source_dir build_dir keepold verbosity outhandle)}
        = ($collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle);

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $buildconfigfile = undef;
    my $found_num_data = 0;

    if ($keepold) {
        # Incremental building: seed num_docs etc from the values stored
        # in build.cfg, if there is one.
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        $found_num_data = 1 if (-e $buildconfigfile);

        if (!$found_num_data) {
            # not in the build dir, so try the index dir
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
            $found_num_data = 1 if (-e $buildconfigfile);
        }
    }

    if ($found_num_data) {
        # NOTE(review): colcfg is not among this file's "use" lines;
        # presumably it is loaded by the caller - confirm.
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
        $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
        $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
    }
    else {
        $self->{"starting_num_$_"} = 0 foreach qw(docs sections bytes);
    }

    $self->{'output_handle'} = "STDOUT";
    foreach my $counter (qw(num_docs num_sections num_bytes)) {
        $self->{$counter} = $self->{"starting_$counter"};
    }

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # the level (section/document) at which database and indexer meet
    $self->{'db_level'} = "section";
    # used by the browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
132
# Put the running counters back to the values they started with
# (the starting_num_* fields) and clear the processed-bytes count.
sub reset {
    my ($self) = @_;

    foreach my $counter (qw(docs sections bytes)) {
        $self->{"num_$counter"} = $self->{"starting_num_$counter"};
    }

    $self->{'num_processed_bytes'} = 0;
}
142
# Set every running counter to zero, ignoring the starting_num_* values.
# num_bytes is zeroed too: reconstructed docs carry no text, so their byte
# counts are stored in the database and added back when they are processed.
sub zero_reset {
    my ($self) = @_;

    $self->{$_} = 0 foreach qw(num_docs num_sections num_bytes);
    $self->{'num_processed_bytes'} = 0;
}
154
# Report whether this build processor supports incremental building.
# The base class answers 'no' (0); it is safer to assume non-incremental
# and let the inherited classes that are capable override this.
sub is_incremental_capable
{
    return 0;
}
163
# Return the current document count.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
169
# Return the current section count.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
175
# Return num_bytes, the actual number of bytes in the collection.
# This is normally the same as what is processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
183
# Return num_processed_bytes, the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
191
# Remember the handle that plugin output is written to and switch that
# handle to the :utf8 layer.
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;
    binmode($handle, ":utf8");
}
199
200
# Set the processing mode; process() calls the method of this name.
sub set_mode {
    my ($self, $mode) = @_;
    $self->{'mode'} = $mode;
}
207
# Return the current processing mode.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
213
# Set the directory under which associated files are placed.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
220
# Set the hash (keyed by metadata field name) of fields that must not be
# written to the info database.
sub set_dontdb {
    my ($self, $dontdb) = @_;
    $self->{'dontdb'} = $dontdb;
}
227
# Set the info database type that is handed to dbutil when entries are written.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;
    $self->{'infodbtype'} = $infodbtype;
}
234
# Set the index specification and, when one is supplied, the array of
# subcollection index expressions. An undefined $indexexparr leaves the
# existing expressions untouched.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
}
242
# Set the metadata field that holds a document's language (a leading "ex."
# is stripped) and the array of language expressions to match against it.
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $lang_meta =~ s/^ex\.//; # strip ex. if there
    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'} = $langarr;
}
250
# Return the current index specification.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
256
# Set the classifiers that each document is passed to during infodb processing.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    $self->{'classifiers'} = $classifiers;
}
263
# Set the indexing_text flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    $self->{'indexing_text'} = $indexing_text;
}
270
# Return the indexing_text flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
276
# Set the store_text flag.
sub set_store_text {
    my ($self, $store_text) = @_;
    $self->{'store_text'} = $store_text;
}
283
# Set the store_metadata_coverage option. infodbedit only collects
# metadata coverage statistics when this matches /^true$/i.
# An undefined or otherwise false value is stored as the empty string so
# that the later pattern match never sees undef.
sub set_store_metadata_coverage {
    my $self = shift (@_);
    my ($store_metadata_coverage) = @_;

    $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
}
290
# Return the OIDs recorded in doclist as a list
# (infodbedit pushes one OID per processed document).
sub get_doc_list {
    my ($self) = @_;
    my $oids = $self->{'doclist'};
    return @$oids;
}
296
# Set the database level. The standard level is "section", but it can be
# changed to "document".
sub set_db_level {
    my ($self, $db_level) = @_;
    $self->{'db_level'} = $db_level;
}
304
# Store the sections_index_document_metadata setting.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    $self->{'sections_index_document_metadata'} = $index_type;
}
311
# Set the separate_cjk flag, which makes filter_text segment its input.
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;
    $self->{'separate_cjk'} = $sep_cjk;
}
318
# Dispatch to the method named by the current mode (see set_mode),
# forwarding all remaining arguments unchanged.
sub process {
    my $self = shift;
    my $handler = $self->{'mode'};

    return $self->$handler(@_);
}
325
# Post-process text depending on field. At present the only processing is
# CJK segmentation, applied when separate_cjk is set; $field is not used.
# This ought to happen only for indexed text ($self->{'indexing_text'}),
# but search term highlighting currently breaks if that is done; once that
# is fixed, fix this too.
sub filter_text {
    my ($self, $field, $text) = @_;

    return $text unless $self->{'separate_cjk'};

    # cjk segmentation
    my $segmented = &cnseg::segment($text);
    return $segmented;
}
342
343
# Keep statistics on the metadata sets in use and on how often each
# metadata field occurs within a set. Two tallies are maintained:
#   doc_mdprefix_fields - scoped to the current document
#   mdprefix_fields     - accumulated across the whole collection
#
# Parameters:
#   $field     - metadata field name. "prefix.name" is counted under that
#                prefix; a name starting with an upper-case letter is
#                counted under the implicit 'ex' set; anything else is
#                ignored.
#   $edit_mode - "add" or "update" increments the tallies; any other value
#                (i.e. delete) decrements them.
sub infodb_metadata_stats
{
    my $self = shift (@_);
    my ($field,$edit_mode) = @_;

    my ($prefix, $core_field);
    if ($field =~ m/^(.+)\.(.*)$/) {
        # explicit metadata set prefix
        ($prefix, $core_field) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($prefix, $core_field) = ('ex', $field);
    }
    else {
        return;
    }

    my $delta = (($edit_mode eq "add") || ($edit_mode eq "update")) ? 1 : -1;

    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
    $self->{'mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
}
386
387
# Add, update or delete one document in the info database.
#
# Parameters:
#   $doc_obj   - the document object to process
#   $filename  - file the document was derived from, or undef when the
#                document was reconstructed from the database (metadata and
#                structure, but no text)
#   $edit_mode - "add", "update" or "delete"
#
# For a file-derived document the archive directory is the directory part
# of $filename, with backslashes turned into slashes and leading/trailing
# slashes removed. A delete only removes the associated files and records
# the OID in dont_process_reconstructed; an update removes the old
# associated files before the new ones are stored. For add and update the
# document is then classified and one info database entry is written per
# section (only the top section when db_level is "document").
sub infodbedit {
    my $self = shift (@_);
    my ($doc_obj, $filename, $edit_mode) = @_;

    # only output this document if it is an "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    my $archivedir = "";
    if (defined $filename)
    {
        # doc_obj derived directly from file:
        # everything before the last path separator is the directory
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;

        $archivedir = $dir;

        if ($edit_mode eq "delete") {
            # record this doc so we don't process the reconstructed doc later
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # we don't need to do anything else for the info database for a
            # deleted document. The infodb starts from scratch each time, so
            # no deletion is necessary
            $self->delete_assoc_files ($archivedir, "delete");
            return;
        }
        if ($edit_mode eq "update") {
            # we don't want to process the reconstructed doc later, but we
            # will process this version now.
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # delete the old assoc files as they may have changed
            $self->delete_assoc_files ($archivedir, "update");
        }

        # resolve the final filenames of the files associated with this document
        # now save the new assoc files for an update/new doc.
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    # rest of code used for add and update. In both cases, we add to the
    # classifiers and to the info database.

    # add this document to the browse structure
    push(@{$self->{'doclist'}},$doc_obj->get_OID())
        unless ($doctype eq "classification");
    $self->{'num_docs'} += 1 unless ($doctype eq "classification");

    if (!defined $filename) {
        # a reconstructed doc
        my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
        if (defined $num_reconstructed_bytes) {
            $self->{'num_bytes'} += $num_reconstructed_bytes;
        }
    }
    # classify the document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # now add all the sections to the infodb.

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};

    # per-document metadata statistics start afresh for each document
    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = $doc_OID;
        if ($section ne "")
        {
            $section_OID = $doc_OID . "." . $section;
        }
        my %section_infodb = ();

        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1 unless ($doctype eq "classification");

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        if ($first && defined $filename) {
            # if we are at the top level of the document, and we are not a
            # reconstructed document, set the total_text_length - used to
            # count bytes when we reconstruct later
            my $length = $doc_obj->get_total_text_length();
            $section_infodb{"total_numbytes"} = [ $length ];
        }
        # Output whether this node contains text
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # doc_obj derived directly from file
            if ($doc_obj->get_text_length($section) > 0) {
                $section_infodb{"hastxt"} = [ "1" ];
            } else {
                $section_infodb{"hastxt"} = [ "0" ];
            }
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            if ($field ne "Identifier" && $field !~ /^gsdl/ &&
                defined $value && $value ne "") {

                # escape problematic stuff
                $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
                $value =~ s/\n/\\n/g;
                $value =~ s/\r/\\r/g;
                # remove ex. if there
                $field =~ s/^ex\.//;

                # special case for URL metadata
                if ($field =~ /^URL$/i) {
                    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
                }

                if (!defined $self->{'dontdb'}->{$field}) {
                    push(@{$section_infodb{$field}}, $value);

                    if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
                    {
                        $self->infodb_metadata_stats($field,$edit_mode);
                    }
                }
            }
        }

        if ($section eq "")
        {
            # top section: record which metadata sets and fields this
            # document uses, and how often
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section()) {
                $section_infodb{"archivedir"} = [ $archivedir ];
            }
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($self->{'db_level'} eq "document") {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, unless we are incremental
        unless ($self->is_incremental_capable())
        {
            if ($self->{'db_level'} eq "document") {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
            }
            else {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
    } # while defined section

}
615
616
617
618
# Add a document to the info database (infodbedit in "add" mode).
sub infodb {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "add");
}
625
# Re-add a changed document to the info database (infodbedit in "update" mode).
sub infodbreindex {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "update");
}
632
# Remove a document from the info database (infodbedit in "delete" mode).
sub infodbdelete {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "delete");
}
639
640
# Placeholder for the text-indexing pass: sub classes must override it.
# The base version reports that on outhandle and then dies.
sub text {
    my ($self, $doc_obj) = @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
649
# Placeholder for re-indexing a document's text: sub classes must override
# it. The base version reports that on outhandle (adding a hint when the
# processor is not incremental capable) and then dies.
sub textreindex
{
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
    unless ($self->is_incremental_capable()) {
        print $outhandle " This operation is only possible with indexing tools with that support\n"
                       . " incremental building\n";
    }
    die "\n";
}
663
# Placeholder for removing a document's text from the index: sub classes
# must override it. The base version reports that on outhandle (adding a
# hint when the processor is not incremental capable) and then dies.
sub textdelete
{
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
    unless ($self->is_incremental_capable()) {
        print $outhandle " This operation is only possible with indexing tools with that support\n"
                       . " incremental building\n";
    }
    die "\n";
}
677
678
# Should the document be indexed, according to the subcollection and
# language specification?
#
# Each subcollection expression has the form "field/regex/options", where
# a leading "!" on the field negates the test, the field "filename" tests
# the source filename instead of metadata, and the option "i" makes the
# match case insensitive. The document qualifies if any expression accepts
# any of its values; with no expressions every document qualifies.
# If the document qualifies and lang_meta is set, its language metadata
# must also satisfy one of the langarr patterns (again with an optional
# leading "!" for negation).
#
# Returns 1 if the document should be indexed, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # options are optional; treat a missing one as "no options"
        $options = "" unless defined $options;
        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            $field =~ s/^ex\.//; #strip ex. if present
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        my $pattern = ($options =~ /^i$/i) ? qr/$exp/i : qr/$exp/;
        foreach my $metadata_value (@metadata_values) {
            my $matched = ($metadata_value =~ $pattern) ? 1 : 0;
            if ($matched != $negate) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the array
                # element, so stripping the "!" in place would turn a
                # negated spec into a positive one for every later document.
                my $lang_pattern = $lang;
                if ($lang_pattern =~ s/^!//) {
                    if ($field !~ /$lang_pattern/) {
                        $indexed_doc = 1; last;
                    }
                } else {
                    if ($field =~ /$lang_pattern/) {
                        $indexed_doc = 1; last;
                    }
                }
            }
        }
    }
    return $indexed_doc;

}
745
# Work out how a document and its children should be displayed.
#
# 'Paged' is used if the document has no more than 2 levels and each
# section at the second level has a number for its Title metadata.
# 'Paged' is also used if gsdlthistype metadata is set to Paged, and
# gsdlthistype "Hierarchy" forces VList. A paged document whose top
# section has no text gets the type "Invisible" for itself.
#
# Returns the list ($thistype, $childtype).
sub get_document_type {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $thistype = "VList";
    my $childtype = "VList";

    my $section = $doc_obj->get_top_section ();

    my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    if (defined $gsdlthistype) {
        if ($gsdlthistype eq "Paged") {
            $childtype = "Paged";
            if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
                $thistype = "Paged";
            } else {
                $thistype = "Invisible";
            }

            return ($thistype, $childtype);
        } elsif ($gsdlthistype eq "Hierarchy") {
            return ($thistype, $childtype); # use VList, VList
        }
    }

    my $first = 1;
    while (defined $section) {
        # a section name with more than one dot-separated part means the
        # document is deeper than two levels
        my @levels = split /\./, $section;
        if (scalar(@levels) > 1) {
            return ($thistype, $childtype);
        }
        if (!$first) {
            # every section below the top one needs a purely numeric Title
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            if (!defined $title || $title !~ /^\d+$/) {
                return ($thistype, $childtype);
            }
        }
        $first = 0;
        $section = $doc_obj->get_next_section($section);
    }

    if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
        $thistype = "Paged";
    } else {
        $thistype = "Invisible";
    }
    $childtype = "Paged";
    return ($thistype, $childtype);
}
799
# Hard-link each of the document's associated files into the assoc
# directory. Every entry returned by get_assoc_files is a pair
# [source file, target name]: a target name starting with a slash (or
# backslash) is placed relative to the assoc dir itself, any other target
# goes beneath the document's archive (HASH...) directory.
sub assoc_files
{
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source, $target) = @{$assoc_file}[0, 1];

        my @path_parts = ($self->{'assocdir'});
        push(@path_parts, $archivedir) unless ($target =~ m@^[/\\]@);

        my $afile = &util::filename_cat(@path_parts, $target);
        &util::hard_link ($source, $afile, $self->{'verbosity'});
    }
}
818
# Remove the directory holding a document's associated files, i.e.
# <assocdir>/<archivedir>, if it exists.
#
# Parameters:
#   $archivedir - the document's archive directory, relative to assocdir
#   $edit_mode  - "delete" or "update"; accepted for the callers' benefit
#                 but not used here
#
# An undefined or empty $archivedir is ignored: it would resolve to the
# assoc directory itself, and removing that would destroy the associated
# files of every document rather than just this one.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    return unless (defined $archivedir && $archivedir ne "");

    my $assoc_dir = &util::filename_cat($self->{'assocdir'}, $archivedir);
    if (-d $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: