source: trunk/gsdl/perllib/docsave.pm@ 1287

Last change on this file since 1287 was 1287, checked in by sjboddie, 24 years ago

Implemented a -sortmeta option for import.pl to sort archives.inf file
(generated at end of import process) alphabetically by the given
metadata element. This may be useful for some collections as boolean
queries currently return matches in build (fairly random) order. Changing
the order of archives.inf changes the order that documents are built.
This option has a couple of important limitations:

  1. Can't be used in conjunction with the groupsize option, as it would then only change the build order of groups of documents, which doesn't seem very useful.
  2. Is of limited use when building indexes at a section level as the build order is only sorted by document, not by section.
  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection
28
29
30package docsave;
31
32use arcinfo;
33use docproc;
34use util;
35
36
# Set up inheritance at compile time: this package derives from docproc.
BEGIN {
    our @ISA = ('docproc');
}
40
# Constructor.
#
# Arguments:
#   $class        - class name the new object is blessed into
#   $collection   - collection name; used to build the default archive
#                   directory under $ENV{'GSDLHOME'}
#   $archive_info - object stored under 'archive_info'; the other methods of
#                   this package call get_info(), add_info() and size() on it
#   $verbosity    - stored under 'verbosity'
#   $gzip         - when true, saved doc.gml files are compressed with gzip
#   $groupsize    - number of documents written to each doc.gml file;
#                   defaults to 1 when undefined
#
# Returns the new object blessed into $class.
sub new {
    my ($class, $collection, $archive_info, $verbosity, $gzip, $groupsize) = @_;

    # Explicit class-method call. The indirect-object form "new docproc ()"
    # depends on parser heuristics and is ambiguous in a package that defines
    # its own new().
    my $self = docproc->new();

    $groupsize = 1 unless defined $groupsize;
    $self->{'collection'} = $collection;
    $self->{'archive_info'} = $archive_info;
    $self->{'verbosity'} = $verbosity;
    $self->{'gzip'} = $gzip;

    $self->{'groupsize'} = $groupsize;
    # number of documents written so far in group mode
    $self->{'gs_count'} = 0;

    # set a default for the archive directory
    $self->{'archive_dir'} = "$ENV{'GSDLHOME'}/collect/$self->{'collection'}/archives";

    # no sort metadata element until set_sortmeta() is called
    $self->{'sortmeta'} = undef;

    return bless $self, $class;
}
62
# Replace the directory that documents are saved into.
sub setarchivedir {
    my ($self, $archive_dir) = @_;

    $self->{'archive_dir'} = $archive_dir;
}
69
# Choose the metadata element whose value process() passes to the archive
# info alongside each saved document.
sub set_sortmeta {
    my ($self, $sortmeta) = @_;

    $self->{'sortmeta'} = $sortmeta;
}
76
# Save one document into the archive directory.
#
# When the groupsize is greater than 1 the work is handed to group_process().
# Otherwise the document gets its own directory and doc.gml file: associated
# files are linked into that directory, the document is written out
# (and gzipped when the 'gzip' flag is set), and the file is registered in
# the archive info together with the value of the sortmeta element, if one
# has been set with set_sortmeta().
#
# Arguments:
#   $doc_obj - the document to save; this sub and its helpers call
#              get_OID(), get_top_section(), output_section(),
#              get_assoc_files(), add_utf8_metadata() and
#              get_metadata_element() on it
#
# If the output file cannot be opened, or the gzipped file does not exist
# after compression, a message is printed to STDERR and the document is not
# registered in the archive info.
sub process {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    if ($self->{'groupsize'} > 1) {
        $self->group_process ($doc_obj);

    } else {
        # groupsize is 1 (i.e. one document per GML file) so sortmeta
        # may be used

        my $OID = $doc_obj->get_OID();
        $OID = "NULL" unless defined $OID;

        # get document's directory
        my $doc_dir = $self->get_doc_dir ($OID);

        # copy all the associated files, add this information as metadata
        # to the document
        $self->process_assoc_files ($doc_obj, $doc_dir);

        my $doc_file
            = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");

        # Three-argument open: the filename is never parsed for a mode.
        # OUTDOC stays a package filehandle because output_section() is
        # given the handle by name.
        if (!open (OUTDOC, '>', $doc_file)) {
            print STDERR "docsave::process could not write to file $doc_file\n";
            return;
        }

        # save this document
        $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
        close OUTDOC;

        if ($self->{'gzip'}) {
            # Compress the file that was just written. $self->{'gs_filename'}
            # is only set by group_process(), so it must not be used here.
            # The list form of system() runs gzip without a shell, so spaces
            # or metacharacters in the path are passed through untouched.
            system ('gzip', $doc_file);
            $doc_file .= ".gz";
            $short_doc_file .= ".gz";
            if (!-e $doc_file) {
                print STDERR "error while gzipping: $doc_file doesn't exist\n";
                return 0;
            }
        }

        # do the sortmeta thing
        my ($metadata);
        if (defined ($self->{'sortmeta'})) {
            $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
        }

        # store reference in the archive_info
        $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
    }
}
132
# Save one document when several documents share a doc.gml file
# (groupsize > 1).
#
# A new doc.gml file is started whenever the number of documents written so
# far is a multiple of the groupsize. Before a new file is opened, the
# previous one is finished via close_file_output(). Every document is then
# written to whichever file is currently open on OUTDOC.
#
# Arguments:
#   $doc_obj - the document to save
#
# Returns early, without writing the document or counting it, if the
# previous file cannot be finished or the new file cannot be opened.
#
# NOTE(review): this sub never finishes the file of the final group itself;
# presumably the caller invokes close_file_output() once all documents have
# been processed -- confirm against the caller.
sub group_process {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    my $groupsize = $self->{'groupsize'};
    my $gs_count = $self->{'gs_count'};
    my $open_new_file = (($gs_count % $groupsize)==0);

    # opening a new file, or document has associated files => directory needed
    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {

        # get document's directory
        my $doc_dir = $self->get_doc_dir ($OID);

        # copy all the associated files, add this information as metadata
        # to the document
        $self->process_assoc_files ($doc_obj, $doc_dir);

        if ($open_new_file) {
            # only if opening new file
            my $doc_file
                = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
            my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");

            # finish (and register) the previous group's file first
            if ($gs_count>0)
            {
                return if (!$self->close_file_output());
            }

            # Three-argument open: the filename is never parsed for a mode.
            # OUTDOC stays a package filehandle because output_section() is
            # given the handle by name.
            if (!open (OUTDOC, '>', $doc_file)) {
                print STDERR "docsave::group_process could not write to file $doc_file\n";
                return;
            }

            # remembered so close_file_output() can gzip and register the file
            $self->{'gs_filename'} = $doc_file;
            $self->{'gs_short_filename'} = $short_doc_file;
            $self->{'gs_OID'} = $OID;
        }
    }

    # save this document
    $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());

    $self->{'gs_count'}++;
}
181
182
# Return the directory (relative to the archive directory) that holds the
# files of the document with the given OID, creating it on disk if needed.
#
# If the archive info already has an entry for the OID, the directory part of
# the recorded doc.gml path is reused. Otherwise a new name is built from the
# OID in pieces of up to eight characters: further pieces are appended as
# sub-directories while the name built so far already exists as a ".dir"
# directory, or while the archive info holds at least 1024 entries and fewer
# than two pieces have been used.
sub get_doc_dir {
    my ($self, $OID) = @_;

    my $archive_info = $self->{'archive_info'};
    my $known = $archive_info->get_info($OID);
    my $dir;

    if (defined $known && scalar(@$known) >= 1) {
        # already assigned a directory: keep it, dropping the file name
        $dir = $known->[0];
        $dir =~ s/\/?doc\.gml(\.gz)?$//;
    }
    else {
        # build a fresh directory name from the OID
        $dir = "";
        my $remaining = $OID;
        my $pieces = 0;

        while (1) {
            $dir .= "/" if $pieces > 0;
            if ($remaining =~ s/^(.{1,8})//) {
                $dir .= $1;
                $pieces++;
            }

            # nothing left of the OID to extend the name with
            last if $remaining eq "";

            # keep going only while the name is taken, or the archive is
            # large and the name is still a single level deep
            my $taken = -d &util::filename_cat ($self->{'archive_dir'}, "$dir.dir");
            last unless $taken || ($archive_info->size() >= 1024 && $pieces < 2);
        }

        $dir .= ".dir";
    }

    &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $dir));

    return $dir;
}
215
216
# Link each of the document's associated files into the document's archive
# directory and record it on the document as "gsdlassocfile" metadata.
#
# Arguments:
#   $doc_obj - the document; get_assoc_files() must return a reference to a
#              list of entries, each an array reference of
#              [source path, name within the document, third field]
#              (the third field is copied into the metadata as-is;
#              presumably a MIME type -- confirm against doc.pm)
#   $doc_dir - the document's directory, relative to the archive directory
#
# The metadata value is "<file name>:<third field>:<directory part of name>".
# Entries whose source file does not exist, or whose name has no final path
# component, are reported on STDERR and skipped.
sub process_assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $doc_dir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        # split the name into an optional directory part and the file name
        my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
        $dir = "" unless defined $dir;

        # a name that is empty or ends in a path separator has no file name
        # part; there is nothing sensible to link it to
        if (!defined $afile) {
            print STDERR "docsave::process_assoc_files couldn't determine a " .
                "file name for the associated file $assoc_file->[0]\n";
            next;
        }

        if (-e $assoc_file->[0]) {
            my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
            &util::hard_link ($assoc_file->[0], $filepath);
            $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
                                         "gsdlassocfile",
                                         "$afile:$assoc_file->[2]:$dir");
        } else {
            print STDERR "docsave::process couldn't copy the associated file " .
                "$assoc_file->[0] to $afile\n";
        }
    }
}
237
238
# Finish the doc.gml file currently being written in group mode.
#
# Closes OUTDOC, gzips the file when the 'gzip' flag is set, and registers
# the file in the archive info under the OID remembered in 'gs_OID'.
#
# Returns 1 on success. Returns 0, after printing a message to STDERR and
# without registering the file, if the gzipped file does not exist after
# compression.
sub close_file_output
{
    my ($self) = @_;

    close OUTDOC;

    my $OID = $self->{'gs_OID'};
    my $short_doc_file = $self->{'gs_short_filename'};

    if ($self->{'gzip'}) {
        my $doc_file = $self->{'gs_filename'};
        # The list form of system() runs gzip without a shell, so spaces or
        # metacharacters in the path cannot be misread as shell syntax.
        system ('gzip', $doc_file);
        $doc_file .= ".gz";
        $short_doc_file .= ".gz";
        if (!-e $doc_file) {
            print STDERR "error while gzipping: $doc_file doesn't exist\n";
            return 0;
        }
    }

    # store reference in the archive_info
    $self->{'archive_info'}->add_info($OID, $short_doc_file);

    return 1;
}
264
2651;
Note: See TracBrowser for help on using the repository browser.