source: trunk/gsdl/perllib/docsave.pm@ 7645

Last change on this file since 7645 was 3834, checked in by sjboddie, 21 years ago

Prevent "use bytes" from causing errors for older perls

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29
30package docsave;
31
32eval {require bytes};
33
34use arcinfo;
35use docproc;
36use util;
37
38
39sub BEGIN {
40 @ISA = ('docproc');
41}
42
43sub new {
44 my ($class, $collection, $archive_info, $verbosity,
45 $gzip, $groupsize, $outhandle) = @_;
46 my $self = new docproc ();
47
48
49 $groupsize=1 unless defined $groupsize;
50 $self->{'collection'} = $collection;
51 $self->{'archive_info'} = $archive_info;
52 $self->{'verbosity'} = $verbosity;
53 $self->{'gzip'} = $gzip;
54
55 $self->{'groupsize'} = $groupsize;
56 $self->{'gs_count'} = 0;
57
58 $self->{'outhandle'} = STDERR;
59 $self->{'outhandle'} = $outhandle if defined $outhandle;
60 # set a default for the archive directory
61 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
62
63 $self->{'sortmeta'} = undef;
64
65 return bless $self, $class;
66}
67
68sub setarchivedir {
69 my $self = shift (@_);
70 my ($archive_dir) = @_;
71
72 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
73 $self->{'archive_dir'} = $archive_dir;
74}
75
76sub set_sortmeta {
77 my $self = shift (@_);
78 my ($sortmeta) = @_;
79
80 $self->{'sortmeta'} = $sortmeta;
81}
82
83sub process {
84 my $self = shift (@_);
85 my ($doc_obj) = @_;
86
87 my $outhandle = $self->{'outhandle'};
88
89 if ($self->{'groupsize'} > 1) {
90 $self->group_process ($doc_obj);
91
92 } else {
93 # groupsize is 1 (i.e. one document per XML file) so sortmeta
94 # may be used
95
96 my $OID = $doc_obj->get_OID();
97 $OID = "NULL" unless defined $OID;
98
99 # get document's directory
100 my $doc_dir = $self->get_doc_dir ($OID);
101
102 # copy all the associated files, add this information as metadata
103 # to the document
104 $self->process_assoc_files ($doc_obj, $doc_dir);
105
106 my $doc_file
107 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
108 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
109
110 if (!open (OUTDOC, ">$doc_file")) {
111 print $outhandle "docsave::process could not write to file $doc_file\n";
112 return;
113 }
114
115 # save this document
116 $self->output_xml_header('docsave::OUTDOC');
117 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
118 $self->output_xml_footer('docsave::OUTDOC');
119 close OUTDOC;
120
121 if ($self->{'gzip'}) {
122 my $doc_file = $self->{'gs_filename'};
123 `gzip $doc_file`;
124 $doc_file .= ".gz";
125 $short_doc_file .= ".gz";
126 if (!-e $doc_file) {
127 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
128 return 0;
129 }
130 }
131
132 # do the sortmeta thing
133 my ($metadata);
134 if (defined ($self->{'sortmeta'})) {
135 $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
136 }
137
138 # store reference in the archive_info
139 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
140 }
141}
142
143sub group_process {
144 my $self = shift (@_);
145 my ($doc_obj) = @_;
146
147 my $outhandle = $self->{'outhandle'};
148
149 my $OID = $doc_obj->get_OID();
150 $OID = "NULL" unless defined $OID;
151
152 my $groupsize = $self->{'groupsize'};
153 my $gs_count = $self->{'gs_count'};
154 my $open_new_file = (($gs_count % $groupsize)==0);
155
156 # opening a new file, or document has assoicated files => directory needed
157 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
158
159 # get document's directory
160 my $doc_dir = $self->get_doc_dir ($OID);
161
162 # copy all the associated files, add this information as metadata
163 # to the document
164 $self->process_assoc_files ($doc_obj, $doc_dir);
165
166
167 if ($open_new_file) {
168 # only if opening new file
169 my $doc_file
170 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
171 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
172
173 if ($gs_count>0)
174 {
175 return if (!$self->close_file_output());
176 }
177
178 if (!open (OUTDOC, ">$doc_file")) {
179 print $outhandle "docsave::group_process could not write to file $doc_file\n";
180 return;
181 }
182 $self->{'gs_filename'} = $doc_file;
183 $self->{'gs_short_filename'} = $short_doc_file;
184 $self->{'gs_OID'} = $OID;
185
186 $self->output_xml_header('docsave::OUTDOC');
187 }
188 }
189
190 # save this document
191 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
192
193 $self->{'gs_count'}++;
194}
195
196
197sub get_doc_dir {
198 my $self = shift (@_);
199 my ($OID) = @_;
200
201 my $doc_info = $self->{'archive_info'}->get_info($OID);
202 my $doc_dir = "";
203 if (defined $doc_info && scalar(@$doc_info) >= 1) {
204 # this OID already has an assigned directory, use the
205 # same one.
206 $doc_dir = $doc_info->[0];
207 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
208 } else {
209 # have to get a new document directory
210 my $doc_dir_rest = $OID;
211 my $doc_dir_num = 0;
212 do {
213 $doc_dir .= "/" if $doc_dir_num > 0;
214 if ($doc_dir_rest =~ s/^(.{1,8})//) {
215 $doc_dir .= $1;
216 $doc_dir_num++;
217 }
218 } while ($doc_dir_rest ne "" &&
219 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
220 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
221 $doc_dir .= ".dir";
222
223 }
224
225 &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
226
227 return $doc_dir;
228}
229
230
231sub process_assoc_files {
232 my $self = shift (@_);
233 my ($doc_obj, $doc_dir) = @_;
234
235 my $outhandle = $self->{'outhandle'};
236
237 my @assoc_files = ();
238 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
239 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
240 $dir = "" unless defined $dir;
241 if (-e $assoc_file->[0]) {
242 my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
243 &util::hard_link ($assoc_file->[0], $filepath);
244 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
245 "gsdlassocfile",
246 "$afile:$assoc_file->[2]:$dir");
247 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
248 "assocfilepath",
249 "$doc_dir");
250 } elsif ($self->{'verbosity'} > 2) {
251 print $outhandle "docsave::process couldn't copy the associated file " .
252 "$assoc_file->[0] to $afile\n";
253 }
254 }
255}
256
257
258sub close_file_output
259{
260 my ($self) = @_;
261
262 # make sure that the handle has been opened - it won't be if we failed
263 # to import any documents...
264 if (defined(fileno(docsave::OUTDOC))) {
265 $self->output_xml_footer('docsave::OUTDOC');
266 close OUTDOC;
267 }
268
269 my $OID = $self->{'gs_OID'};
270 my $short_doc_file = $self->{'gs_short_filename'};
271
272 if ($self->{'gzip'}) {
273 my $doc_file = $self->{'gs_filename'};
274 `gzip $doc_file`;
275 $doc_file .= ".gz";
276 $short_doc_file .= ".gz";
277 if (!-e $doc_file) {
278 my $outhandle = $self->{'outhandle'};
279 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
280 return 0;
281 }
282 }
283
284 # store reference in the archive_info
285 $self->{'archive_info'}->add_info($OID, $short_doc_file);
286
287 return 1;
288}
289
290sub output_xml_header {
291 my $self = shift (@_);
292 my ($handle) = @_;
293
294 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
295 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
296 print $handle "<Archive>\n";
297}
298
299sub output_xml_footer {
300 my $self = shift (@_);
301 my ($handle) = @_;
302
303 print $handle "</Archive>\n";
304}
305
3061;
Note: See TracBrowser for help on using the repository browser.