Context Navigation

source: gsdl/trunk/perllib/basebuildproc.pm@ 19617

Last change on this file since 19617 was 18508, checked in by davidb, 15 years ago
Had to move location of where deletion of archive files was done
Property svn:keywords set to `Author Date Id Revision`
File size: 21.7 KB

Line
1	##########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict;
38	no strict 'subs';
39	no strict 'refs';
40	use util;
41
42	BEGIN {
43	@basebuildproc::ISA = ('docproc');
44	}
45
# Constructor.
#
# Arguments (positional):
#   $class      - class name to bless the new object into
#   $collection - stored as 'collection'
#   $source_dir - stored as 'source_dir'
#   $build_dir  - stored as 'build_dir'; also the initial 'assocdir'
#   $keepold    - when true, the starting document/section/byte counts are
#                 seeded from an existing build.cfg (incremental building)
#   $verbosity  - stored as 'verbosity'
#   $outhandle  - where debugging info goes; defaults to "STDERR"
#
# Returns the new object, built on top of a docproc instance.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;

    # Direct method call rather than the ambiguous indirect-object form.
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = "STDERR" unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'keepold'} = $keepold;
    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $found_num_data = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present): look in the build dir first,
        # then fall back to the collection's index dir.
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        if (!-e $buildconfigfile) {
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
        }
        $found_num_data = 1 if (-e $buildconfigfile);
    }

    if ($found_num_data) {
        # NOTE(review): colcfg is not loaded by this file; presumably the
        # calling script has already loaded it - confirm.
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
        $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
        $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
    }
    else {
        $self->{'starting_num_docs'} = 0;
        $self->{'starting_num_sections'} = 0;
        $self->{'starting_num_bytes'} = 0;
    }

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
132
# Restore the running document/section/byte counters to their starting
# values and clear the processed-bytes counter.
sub reset {
    my ($self) = @_;

    foreach my $counter ('num_docs', 'num_sections', 'num_bytes') {
        $self->{$counter} = $self->{"starting_$counter"};
    }

    $self->{'num_processed_bytes'} = 0;
}
142
# Zero the document and section counters and the processed-bytes counter.
# The byte counter goes back to its starting value instead of zero:
# reconstructed docs have no text, just metadata, so we need to remember
# how many bytes we had initially.
sub zero_reset {
    my ($self) = @_;

    @{$self}{'num_docs', 'num_sections'} = (0, 0);
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
}
154
# Whether this build processor supports incremental building.
# The base class always answers 'no' (0): it is safer to assume
# non-incremental and let inherited classes that are capable override this.
sub is_incremental_capable
{
    my $incremental_capable = 0;
    return $incremental_capable;
}
163
# Returns the current document count.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
169
# Returns the current section count.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
175
# Returns num_bytes, the actual number of bytes in the collection.
# This is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
183
# Returns num_processed_bytes, the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
191
# Set the handle that plugin output is written to. infodbedit passes this
# handle to the dbutil write/delete calls.
sub set_output_handle {
    my ($self, $handle) = @_;
    $self->{'output_handle'} = $handle;
}
198
199
# Set the processing mode. process() uses this value as the name of the
# method to invoke for each document.
sub set_mode {
    my ($self, $mode) = @_;
    $self->{'mode'} = $mode;
}
206
# Returns the current processing mode.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
212
# Set the directory that assoc_files() links associated files into.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
219
# Set the hash of metadata field names to leave out of the database:
# infodbedit skips any field that has a defined entry in this hash.
sub set_dontdb {
    my ($self, $dontdb) = @_;
    $self->{'dontdb'} = $dontdb;
}
226
# Set the info database type, which infodbedit hands to the dbutil
# write/delete calls.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;
    $self->{'infodbtype'} = $infodbtype;
}
233
# Set the current index and, optionally, the list of subcollection
# expressions consulted by is_subcollection_doc(). The expression list is
# left untouched when $indexexparr is not defined.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    if (defined $indexexparr) {
        $self->{'indexexparr'} = $indexexparr;
    }
}
241
# Set the language restriction used by is_subcollection_doc():
#   $lang_meta - name of the metadata element holding a document's language
#   $langarr   - array ref of language patterns (a leading '!' negates)
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'} = $langarr;
}
248
# Returns the current index specification.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
254
# Set the classifiers that infodbedit passes to classify::classify_doc.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    $self->{'classifiers'} = $classifiers;
}
261
# Record the 'indexing_text' flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    $self->{'indexing_text'} = $indexing_text;
}
268
# Returns the 'indexing_text' flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
274
# Record the 'store_text' flag.
sub set_store_text {
    my ($self, $store_text) = @_;
    $self->{'store_text'} = $store_text;
}
281
# Record the 'store_metadata_coverage' setting; any false value is stored
# as the empty string. infodbedit collects metadata statistics only when
# this matches "true" (case-insensitively).
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $store_metadata_coverage = "" unless $store_metadata_coverage;
    $self->{'store_metadata_coverage'} = $store_metadata_coverage;
}
288
# Returns the list of document OIDs collected for the browse structure
# (a list, not a reference).
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
294
# Set the database level. The standard level is "section", but you may
# want to change it to "document", in which case infodbedit writes only
# the first (top) section of each document.
sub set_db_level {
    my ($self, $db_level) = @_;
    $self->{'db_level'} = $db_level;
}
302
# Record the 'sections_index_document_metadata' setting.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    $self->{'sections_index_document_metadata'} = $index_type;
}
309
# Turn CJK segmentation in filter_text() on (true) or off (false).
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;
    $self->{'separate_cjk'} = $sep_cjk;
}
316
# Dispatch to the method named by the current mode, forwarding all
# remaining arguments unchanged.
sub process {
    my $self = shift;

    my $handler_name = $self->{'mode'};
    return $self->$handler_name(@_);
}
323
# Post-process text depending on field. Currently the only processing is
# cjk separation: when 'separate_cjk' is set the text is run through
# cnseg::segment, otherwise it is returned unchanged ($field is unused).
# This should only be done for indexed text (if $self->{'indexing_text'}),
# but currently search term highlighting doesn't work if you do that;
# once that is fixed, fix this.
sub filter_text {
    my ($self, $field, $text) = @_;

    return $text unless $self->{'separate_cjk'};

    my $segmented = &cnseg::segment($text);
    return $segmented;
}
340
341
# Keep statistics on the metadata sets used and how often each field in a
# set occurs, both for the current document ('doc_mdprefix_fields') and
# across the whole collection ('mdprefix_fields').
#
#   $field     - metadata field name; "prefix.name" is counted under
#                'prefix', a name starting with an upper-case letter is
#                counted under the implicit 'ex' set, anything else is
#                ignored
#   $edit_mode - "add" or "update" increments the counts; any other value
#                (delete) decrements them
sub infodb_metadata_stats
{
    my ($self, $field, $edit_mode) = @_;

    my ($prefix, $core_field);
    if ($field =~ m/^(.+)\.(.*)$/) {
        ($prefix, $core_field) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($prefix, $core_field) = ('ex', $field);
    }
    else {
        return;
    }

    my $delta = (($edit_mode eq "add") || ($edit_mode eq "update")) ? 1 : -1;

    foreach my $scope ('doc_mdprefix_fields', 'mdprefix_fields') {
        $self->{$scope}->{$prefix}->{$core_field} += $delta;
    }
}
384
385
# Add, update or delete the info-database entries for one document.
#
#   $doc_obj   - the document object
#   $filename  - file the document was derived from, or undef when the
#                document was reconstructed from the database (metadata and
#                structure but no text)
#   $edit_mode - "add" or "update" write entries; any other value (delete)
#                removes them
#
# Besides writing through dbutil, this keeps 'doclist', 'num_docs',
# 'num_sections' and 'num_bytes' up to date and passes the document to the
# classifiers. Documents whose type is neither "indexed_doc" nor
# "info_doc" are ignored.
sub infodbedit {
    my $self = shift (@_);
    my ($doc_obj, $filename, $edit_mode) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    # "add" and "update" are treated identically; everything else is a delete
    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update")) ? 1 : 0;

    my $archivedir = "";
    if (defined $filename)
    {
        # doc_obj derived directly from file: the archive dir is everything
        # before the last path separator (either / or \)
        my ($dir) = $filename =~ m{^(.*?)[/\\][^/\\]*$};
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;

        $archivedir = $dir;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    my $doc_OID = $doc_obj->get_OID();

    if ($adding) {
        # add this document to the browse structure
        push(@{$self->{'doclist'}}, $doc_OID);
    }
    else {
        # delete => remove this doc from browse structure
        $self->{'doclist'} = [ grep { $_ ne $doc_OID } @{$self->{'doclist'}} ];
    }

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);

    # this is another document (or one fewer, when deleting)
    $self->{'num_docs'} += ($adding ? 1 : -1);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};
    my $infodbtype = $self->{'infodbtype'};

    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = ($section ne "") ? $doc_OID . "." . $section : $doc_OID;
        my %section_infodb = ();

        # update a few statistics
        my $text_length = $doc_obj->get_text_length ($section);
        if ($adding) {
            $self->{'num_bytes'} += $text_length;
            $self->{'num_sections'} += 1;
        }
        else {
            $self->{'num_bytes'} -= $text_length;
            $self->{'num_sections'} -= 1;
        }

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        # Output whether this node contains text
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            $section_infodb{"hastxt"} = [ ($text_length > 0) ? "1" : "0" ];
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next unless ($field ne "Identifier" && $field !~ /^gsdl/ &&
                         defined $value && $value ne "");

            # escape problematic stuff
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            # special case for URL metadata: keep an entry keyed by the URL
            if ($field =~ /^URL$/i) {
                if ($adding) {
                    &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $value, { 'section' => [ $section_OID ] });
                }
                else {
                    &dbutil::delete_infodb_entry($infodbtype, $infodb_handle, $value);
                }
            }

            if (!defined $self->{'dontdb'}->{$field}) {
                push(@{$section_infodb{$field}}, $value);

                if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
                {
                    $self->infodb_metadata_stats($field,$edit_mode);
                }
            }
        }

        if ($section eq "")
        {
            # record which metadata sets/fields this document uses
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section()) {
                $section_infodb{"archivedir"} = [ $archivedir ];
            }
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        my $document_level = ($self->{'db_level'} eq "document") ? 1 : 0;

        if ($document_level) {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        if ($adding) {
            &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $section_OID, \%section_infodb);
        }
        else {
            &dbutil::delete_infodb_entry($infodbtype, $infodb_handle, $section_OID);
        }

        # output a database entry for the document number, except for Lucene (which no longer needs this information)
        unless (ref($self) eq "lucenebuildproc")
        {
            my ($docnum_key, $docnum_target) = $document_level
                ? ($self->{'num_docs'}, $doc_OID)
                : ($self->{'num_sections'}, $section_OID);

            if ($adding) {
                &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $docnum_key, { 'section' => [ $docnum_target ] });
            }
            else {
                &dbutil::delete_infodb_entry($infodbtype, $infodb_handle, $docnum_key);
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($document_level); # if no sections wanted, only add the docs
    }
}
638
639
640
641
# Add a document's entries to the info database
# (infodbedit in "add" mode).
sub infodb {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "add");
}
648
# Re-write a document's entries in the info database
# (infodbedit in "update" mode).
sub infodbreindex {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "update");
}
655
# Remove a document's entries from the info database
# (infodbedit in "delete" mode).
sub infodbdelete {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "delete");
}
662
663
# Placeholder for the "text" processing mode: sub classes must provide
# their own implementation. The base version reports that on the debug
# handle and dies.
sub text {
    my ($self, $doc_obj) = @_;

    print {$self->{'outhandle'}} "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
672
# Placeholder for the "textreindex" processing mode: sub classes must
# provide their own implementation. The base version reports that on the
# debug handle (with an extra hint when the processor is not incremental
# capable) and dies.
sub textreindex
{
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    print $out "basebuildproc::textreindex function must be implemented in sub classes\n";

    unless ($self->is_incremental_capable()) {
        foreach my $hint (" This operation is only possible with indexing tools with that support\n",
                          " incremental building\n") {
            print $out $hint;
        }
    }
    die "\n";
}
686
# Placeholder for the "textdelete" processing mode: sub classes must
# provide their own implementation. The base version reports that on the
# debug handle (with an extra hint when the processor is not incremental
# capable) and dies.
sub textdelete
{
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    print $out "basebuildproc::textdelete function must be implemented in sub classes\n";

    unless ($self->is_incremental_capable()) {
        foreach my $hint (" This operation is only possible with indexing tools with that support\n",
                          " incremental building\n") {
            print $out $hint;
        }
    }
    die "\n";
}
700
701
# Should the document be indexed - according to the subcollection and
# language specification.
#
# Subcollection expressions ('indexexparr') have the form
# "field/regexp/options". A leading '!' on the field negates the test,
# the field "filename" (any case) tests the source filename instead of
# metadata, and the option "i" makes the match case-insensitive. The
# document is accepted as soon as one expression accepts it; with no
# expressions at all it is accepted.
#
# If the document is accepted so far and 'lang_meta' is defined, the
# document's language metadata must also satisfy at least one entry of
# 'langarr' (a leading '!' negates the entry).
#
# Returns 1 if the document should be indexed, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # $field is a copy produced by split, so stripping the '!' here
        # does not touch the stored expression
        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        # options are optional, so guard against a missing third part
        my $ignore_case = (defined $options && $options =~ /^i$/i) ? 1 : 0;
        my $pattern = $ignore_case ? qr/$exp/i : qr/$exp/;

        foreach my $metadata_value (@metadata_values) {
            my $matches = ($metadata_value =~ $pattern) ? 1 : 0;
            # plain test: accept on a match; negated test: accept on a non-match
            if ($matches != $negate) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the stored array
                # element, so stripping the '!' in place would permanently
                # turn an exclusion into an inclusion for later documents.
                my $lang = $lang_spec;
                my $negate = ($lang =~ s/^!//) ? 1 : 0;

                my $matches = ($field =~ /$lang/) ? 1 : 0;
                if ($matches != $negate) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }

    return $indexed_doc;
}
767
# Work out the display types for a document and its children.
#
# Returns the list ($thistype, $childtype):
#   - ("VList", "VList") for hierarchical documents;
#   - ("Paged", "Paged") or ("Invisible", "Paged") for paged documents,
#     "Invisible" being used when the top section has no text.
#
# A document is paged if its gsdlthistype metadata is "Paged", or if it
# has no more than 2 levels and every section at the second level has a
# number for its Title metadata. gsdlthistype "Hierarchy" forces VList.
sub get_document_type {
    my ($self, $doc_obj) = @_;

    my $top = $doc_obj->get_top_section ();

    my $declared = $doc_obj->get_metadata_element ($top, "gsdlthistype");
    if (defined $declared) {
        if ($declared eq "Paged") {
            my $paged_top = $doc_obj->get_text_length ($doc_obj->get_top_section())
                ? "Paged" : "Invisible";
            return ($paged_top, "Paged");
        }
        return ("VList", "VList") if ($declared eq "Hierarchy");
    }

    my $at_top = 1;
    for (my $section = $top; defined $section; $section = $doc_obj->get_next_section($section)) {
        # a dotted section number means a third level: not paged
        my @levels = split /\./, $section;
        return ("VList", "VList") if (scalar(@levels) > 1);

        unless ($at_top) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            return ("VList", "VList") if (!defined $title || $title !~ /^\d+$/);
        }
        $at_top = 0;
    }

    my $top_type = $doc_obj->get_text_length ($doc_obj->get_top_section())
        ? "Paged" : "Invisible";
    return ($top_type, "Paged");
}
821
# Hard-link every file associated with the document into the assoc dir.
#
#   $doc_obj    - document whose associated files are processed; each entry
#                 of get_assoc_files() is used as [ source, destination ]
#   $archivedir - the document's archive (HASH...) directory
#
# If the destination name starts with a slash (either kind) it is placed
# relative to the assoc dir, otherwise relative to the HASH... directory
# inside the assoc dir.
sub assoc_files
{
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source_file, $assoc_name) = ($assoc_file->[0], $assoc_file->[1]);

        my @dest_parts = ($self->{'assocdir'});
        push(@dest_parts, $archivedir) unless ($assoc_name =~ m@^[/\\]@);

        my $afile = &util::filename_cat(@dest_parts, $assoc_name);
        &util::hard_link ($source_file, $afile, $self->{'verbosity'});
    }
}
840

Note: See TracBrowser for help on using the repository browser.

Download in other formats: