source: trunk/gsdl/perllib/docsave.pm@ 2925

Last change on this file since 2925 was 2925, checked in by sjboddie, 22 years ago

Altered the format of the GreenstoneArchive and GreenstoneDirectoryMetadata
XML files slightly (they're now called Archive and DirectoryMetadata
respectively).

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29
30package docsave;
31
32use arcinfo;
33use docproc;
34use util;
35
36
37sub BEGIN {
38 @ISA = ('docproc');
39}
40
41sub new {
42 my ($class, $collection, $archive_info, $verbosity,
43 $gzip, $groupsize, $outhandle) = @_;
44 my $self = new docproc ();
45
46
47 $groupsize=1 unless defined $groupsize;
48 $self->{'collection'} = $collection;
49 $self->{'archive_info'} = $archive_info;
50 $self->{'verbosity'} = $verbosity;
51 $self->{'gzip'} = $gzip;
52
53 $self->{'groupsize'} = $groupsize;
54 $self->{'gs_count'} = 0;
55
56 $self->{'outhandle'} = STDERR;
57 $self->{'outhandle'} = $outhandle if defined $outhandle;
58 # set a default for the archive directory
59 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
60
61 $self->{'sortmeta'} = undef;
62
63 return bless $self, $class;
64}
65
66sub setarchivedir {
67 my $self = shift (@_);
68 my ($archive_dir) = @_;
69
70 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
71 $self->{'archive_dir'} = $archive_dir;
72}
73
74sub set_sortmeta {
75 my $self = shift (@_);
76 my ($sortmeta) = @_;
77
78 $self->{'sortmeta'} = $sortmeta;
79}
80
81sub process {
82 my $self = shift (@_);
83 my ($doc_obj) = @_;
84
85 my $outhandle = $self->{'outhandle'};
86
87 if ($self->{'groupsize'} > 1) {
88 $self->group_process ($doc_obj);
89
90 } else {
91 # groupsize is 1 (i.e. one document per XML file) so sortmeta
92 # may be used
93
94 my $OID = $doc_obj->get_OID();
95 $OID = "NULL" unless defined $OID;
96
97 # get document's directory
98 my $doc_dir = $self->get_doc_dir ($OID);
99
100 # copy all the associated files, add this information as metadata
101 # to the document
102 $self->process_assoc_files ($doc_obj, $doc_dir);
103
104 my $doc_file
105 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
106 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
107
108 if (!open (OUTDOC, ">$doc_file")) {
109 print $outhandle "docsave::process could not write to file $doc_file\n";
110 return;
111 }
112
113 # save this document
114 $self->output_xml_header('docsave::OUTDOC');
115 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
116 $self->output_xml_footer('docsave::OUTDOC');
117 close OUTDOC;
118
119 if ($self->{'gzip'}) {
120 my $doc_file = $self->{'gs_filename'};
121 `gzip $doc_file`;
122 $doc_file .= ".gz";
123 $short_doc_file .= ".gz";
124 if (!-e $doc_file) {
125 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
126 return 0;
127 }
128 }
129
130 # do the sortmeta thing
131 my ($metadata);
132 if (defined ($self->{'sortmeta'})) {
133 $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
134 }
135
136 # store reference in the archive_info
137 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
138 }
139}
140
141sub group_process {
142 my $self = shift (@_);
143 my ($doc_obj) = @_;
144
145 my $outhandle = $self->{'outhandle'};
146
147 my $OID = $doc_obj->get_OID();
148 $OID = "NULL" unless defined $OID;
149
150 my $groupsize = $self->{'groupsize'};
151 my $gs_count = $self->{'gs_count'};
152 my $open_new_file = (($gs_count % $groupsize)==0);
153
154 # opening a new file, or document has assoicated files => directory needed
155 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
156
157 # get document's directory
158 my $doc_dir = $self->get_doc_dir ($OID);
159
160 # copy all the associated files, add this information as metadata
161 # to the document
162 $self->process_assoc_files ($doc_obj, $doc_dir);
163
164
165 if ($open_new_file) {
166 # only if opening new file
167 my $doc_file
168 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
169 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
170
171 if ($gs_count>0)
172 {
173 return if (!$self->close_file_output());
174 }
175
176 if (!open (OUTDOC, ">$doc_file")) {
177 print $outhandle "docsave::group_process could not write to file $doc_file\n";
178 return;
179 }
180 $self->{'gs_filename'} = $doc_file;
181 $self->{'gs_short_filename'} = $short_doc_file;
182 $self->{'gs_OID'} = $OID;
183
184 $self->output_xml_header('docsave::OUTDOC');
185 }
186 }
187
188 # save this document
189 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
190
191 $self->{'gs_count'}++;
192}
193
194
195sub get_doc_dir {
196 my $self = shift (@_);
197 my ($OID) = @_;
198
199 my $doc_info = $self->{'archive_info'}->get_info($OID);
200 my $doc_dir = "";
201 if (defined $doc_info && scalar(@$doc_info) >= 1) {
202 # this OID already has an assigned directory, use the
203 # same one.
204 $doc_dir = $doc_info->[0];
205 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
206 } else {
207 # have to get a new document directory
208 my $doc_dir_rest = $OID;
209 my $doc_dir_num = 0;
210 do {
211 $doc_dir .= "/" if $doc_dir_num > 0;
212 if ($doc_dir_rest =~ s/^(.{1,8})//) {
213 $doc_dir .= $1;
214 $doc_dir_num++;
215 }
216 } while ($doc_dir_rest ne "" &&
217 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
218 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
219 $doc_dir .= ".dir";
220
221 }
222
223 &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
224
225 return $doc_dir;
226}
227
228
229sub process_assoc_files {
230 my $self = shift (@_);
231 my ($doc_obj, $doc_dir) = @_;
232
233 my $outhandle = $self->{'outhandle'};
234
235 my @assoc_files = ();
236 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
237 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
238 $dir = "" unless defined $dir;
239 if (-e $assoc_file->[0]) {
240 my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
241 &util::hard_link ($assoc_file->[0], $filepath);
242 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
243 "gsdlassocfile",
244 "$afile:$assoc_file->[2]:$dir");
245 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
246 "assocfilepath",
247 "$doc_dir");
248 } elsif ($self->{'verbosity'} > 2) {
249 print $outhandle "docsave::process couldn't copy the associated file " .
250 "$assoc_file->[0] to $afile\n";
251 }
252 }
253}
254
255
256sub close_file_output
257{
258 my ($self) = @_;
259
260 $self->output_xml_footer('docsave::OUTDOC');
261 close OUTDOC;
262
263 my $OID = $self->{'gs_OID'};
264 my $short_doc_file = $self->{'gs_short_filename'};
265
266 if ($self->{'gzip'}) {
267 my $doc_file = $self->{'gs_filename'};
268 `gzip $doc_file`;
269 $doc_file .= ".gz";
270 $short_doc_file .= ".gz";
271 if (!-e $doc_file) {
272 my $outhandle = $self->{'outhandle'};
273 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
274 return 0;
275 }
276 }
277
278 # store reference in the archive_info
279 $self->{'archive_info'}->add_info($OID, $short_doc_file);
280
281 return 1;
282}
283
284sub output_xml_header {
285 my $self = shift (@_);
286 my ($handle) = @_;
287
288 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
289 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
290 print $handle "<Archive>\n";
291}
292
293sub output_xml_footer {
294 my $self = shift (@_);
295 my ($handle) = @_;
296
297 print $handle "</Archive>\n";
298}
299
3001;
Note: See TracBrowser for help on using the repository browser.