source: trunk/gsdl/perllib/docsave.pm@ 8094

Last change on this file since 8094 was 8094, checked in by jrm21, 20 years ago

fix errors with uninitialised variables if 'saveas' not specified.

added 'use strict' to the top of the file so people won't use undeclared
variables.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use docproc;
38use util;
39
40
# Set up inheritance at compile time: docsave derives from docproc.
BEGIN {
    @docsave::ISA = qw(docproc);
}
44
# Constructor.
#
# Arguments (all positional):
#   $class        - class name to bless the new object into
#   $collection   - stored as-is under 'collection'
#   $archive_info - object used to look up / record archive entries
#                   (get_info, add_info and size are called on it elsewhere
#                   in this file)
#   $verbosity    - numeric verbosity level
#   $gzip         - true if saved documents should be gzipped
#   $groupsize    - number of documents per output file (defaults to 1)
#   $outhandle    - handle (or handle name) for diagnostics
#                   (defaults to 'STDERR')
#
# Returns the new object, built on top of a docproc instance.
sub new {
    my ($class, $collection, $archive_info, $verbosity,
        $gzip, $groupsize, $outhandle) = @_;

    # Explicit class-method call. The indirect-object form "new docproc ()"
    # is ambiguous inside a package that defines its own "new".
    my $self = docproc->new();

    $self->{'collection'} = $collection;
    $self->{'archive_info'} = $archive_info;
    $self->{'verbosity'} = $verbosity;
    $self->{'gzip'} = $gzip;

    $self->{'groupsize'} = defined $groupsize ? $groupsize : 1;
    # number of documents written so far in group mode
    $self->{'gs_count'} = 0;

    # a handle *name* is fine here because the file runs with no strict 'refs'
    $self->{'outhandle'} = defined $outhandle ? $outhandle : 'STDERR';

    # set a default for the archive directory
    $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");

    $self->{'sortmeta'} = undef;

    return bless $self, $class;
}
69
# Point this processor at a different archive directory, creating the
# directory (and any missing parents) first if nothing exists at that path.
sub setarchivedir {
    my ($self, $archive_dir) = @_;

    if (!-e $archive_dir) {
        &util::mk_all_dir ($archive_dir);
    }

    $self->{'archive_dir'} = $archive_dir;
}
77
# Record the name of the metadata element whose value is passed to the
# archive info alongside each document (see process()).
sub set_sortmeta {
    my ($self, $sortmeta) = @_;

    $self->{'sortmeta'} = $sortmeta;
}
84
# Save one document into the archive directory.
#
# When 'groupsize' is greater than 1 the work is delegated to
# group_process().  Otherwise the document gets its own directory and is
# written either as doc.xml ('saveas' eq "GA", the default) or as
# doctxt.xml + docmets.xml ('saveas' eq "METS").  The saved file is
# optionally gzipped and finally registered with the archive info together
# with the value of the 'sortmeta' metadata element, if one was configured.
#
# Arguments:
#   $doc_obj - the document object to save
#
# Returns nothing useful on most paths; returns 0 if gzipping failed.
sub process {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $outhandle = $self->{'outhandle'};

    if ($self->{'groupsize'} > 1) {
        $self->group_process ($doc_obj);
        return;
    }

    # groupsize is 1 (i.e. one document per XML file) so sortmeta
    # may be used

    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    # get document's directory
    my $doc_dir = $self->get_doc_dir ($OID);

    # copy all the associated files, add this information as metadata
    # to the document
    $self->process_assoc_files ($doc_obj, $doc_dir);

    my $working_dir
        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir);
    my $doc_file
        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
    my $doc_txt_file
        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doctxt.xml");
    my $doc_mets_file
        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "docmets.xml");

    # $short_doc_file is the archive-relative name recorded in the archive
    # info; $saved_file is the matching full path (the file that gets
    # gzipped if requested).
    my ($short_doc_file, $saved_file);
    my $save_as = $self->{'saveas'} || "GA";
    if ($save_as eq "GA") {
        $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
        $saved_file = $doc_file;
    } elsif ($save_as eq "METS") {
        $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
        $saved_file = $doc_mets_file;
    } else {
        # report the problem instead of returning silently
        print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
        return;
    }
    # save for later (for close_file_output())
    $self->{'short_doc_file'} = $short_doc_file;

    if ($save_as eq "GA") {
        if (!open (OUTDOC, '>', $doc_file)) {
            print $outhandle "docsave::process could not write to file $doc_file\n";
            return;
        }

        # save this document
        $self->output_xml_header('docsave::OUTDOC');
        $doc_obj->output_section('docsave::OUTDOC',
                                 $doc_obj->get_top_section());
        $self->output_xml_footer('docsave::OUTDOC');

        # buffered write errors only surface at close
        if (!close (OUTDOC)) {
            print $outhandle "docsave::process could not write to file $doc_file\n";
            return;
        }
    } else { # METS
        # save the document without metadata: doctxt.xml
        if (!open (OUTDOC_TXT, '>', $doc_txt_file)) {
            print $outhandle "docsave::process could not write to file $doc_txt_file\n";
            return;
        }

        $self->output_txt_xml_header('docsave::OUTDOC_TXT');
        $doc_obj->output_txt_section('docsave::OUTDOC_TXT',
                                     $doc_obj->get_top_section());

        # Convert doctxt.xml file to docmets.xml
        if (!open (OUTDOC_METS, '>', $doc_mets_file)) {
            print $outhandle "docsave::process could not write to file $doc_mets_file\n";
            close (OUTDOC_TXT);   # don't leak the handle opened above
            return;
        }

        $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
        $doc_obj->output_mets_section('docsave::OUTDOC_METS',
                                      $doc_obj->get_top_section(),
                                      $working_dir);
        $self->output_mets_xml_footer('docsave::OUTDOC_METS');

        my $txt_closed = close (OUTDOC_TXT);
        my $mets_closed = close (OUTDOC_METS);
        if (!$txt_closed) {
            print $outhandle "docsave::process could not write to file $doc_txt_file\n";
            return;
        }
        if (!$mets_closed) {
            print $outhandle "docsave::process could not write to file $doc_mets_file\n";
            return;
        }
    }

    if ($self->{'gzip'}) {
        # Compress the file written above.  The list form of system() keeps
        # the shell out of it, so paths containing spaces or shell
        # metacharacters are passed through intact.
        system ('gzip', $saved_file);
        my $gz_file = $saved_file . ".gz";
        $short_doc_file .= ".gz";
        if (!-e $gz_file) {
            print $outhandle "error while gzipping: $gz_file doesn't exist\n";
            return 0;
        }
    }

    # do the sortmeta thing
    my $metadata;
    if (defined ($self->{'sortmeta'})) {
        $metadata = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
                                                   $self->{'sortmeta'});
    }

    # store reference in the archive_info
    $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
}
200
201
# Save a document in "group" mode, where up to 'groupsize' documents share
# a single doc.xml file.
#
# Every 'groupsize' documents the previous output file is finished via
# close_file_output() and a new doc.xml is started in the directory of the
# document that opens the group.  A document that has associated files
# also gets a directory of its own for those files.
#
# Arguments:
#   $doc_obj - the document object to save
#
# Returns nothing.  If the previous file could not be finished or the new
# one could not be opened, the document is not written and 'gs_count' is
# not advanced.
sub group_process {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $outhandle = $self->{'outhandle'};

    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    my $groupsize = $self->{'groupsize'};
    my $gs_count = $self->{'gs_count'};
    my $open_new_file = (($gs_count % $groupsize) == 0);

    # opening a new file, or document has associated files => directory needed
    if ($open_new_file || scalar(@{$doc_obj->get_assoc_files()}) > 0) {

        # get document's directory
        my $doc_dir = $self->get_doc_dir ($OID);

        # copy all the associated files, add this information as metadata
        # to the document
        $self->process_assoc_files ($doc_obj, $doc_dir);

        if ($open_new_file) {
            # only if opening new file
            my $doc_file
                = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
            my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");

            # finish off the previous group's file first
            if ($gs_count > 0) {
                return if (!$self->close_file_output());
            }

            # 3-arg open: the file name is never interpreted as a mode
            if (!open (OUTDOC, '>', $doc_file)) {
                print $outhandle "docsave::group_process could not write to file $doc_file\n";
                return;
            }

            # remembered for close_file_output()
            $self->{'gs_filename'} = $doc_file;
            $self->{'gs_short_filename'} = $short_doc_file;
            $self->{'gs_OID'} = $OID;

            $self->output_xml_header('docsave::OUTDOC');
        }
    }

    # save this document
    $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());

    $self->{'gs_count'}++;
}
254
255
# Work out (and create) the archive sub-directory for a document.
#
# If the archive info already knows this OID, the directory part of the
# recorded file name is reused.  Otherwise a new directory name is built
# from the OID in chunks of up to eight characters, adding further levels
# while the candidate "<name>.dir" already exists, or while the archive
# already holds 1024 or more entries and fewer than two levels have been
# used.
#
# Arguments:
#   $OID - the document's object identifier
#
# Returns the directory name relative to the archive directory.
sub get_doc_dir {
    my $self = shift (@_);
    my ($OID) = @_;

    my $doc_info = $self->{'archive_info'}->get_info($OID);
    my $doc_dir = "";
    if (defined $doc_info && scalar(@$doc_info) >= 1) {
        # this OID already has an assigned directory, use the
        # same one.
        $doc_dir = $doc_info->[0];
        # Strip the document file name.  process() records either doc.xml
        # or docmets.xml (optionally gzipped), and the separator may be a
        # forward slash or a backslash.
        $doc_dir =~ s/[\/\\]?doc(?:mets)?\.xml(?:\.gz)?$//;
    } else {
        # have to get a new document directory
        my $doc_dir_rest = $OID;
        my $doc_dir_num = 0;
        do {
            $doc_dir .= "/" if $doc_dir_num > 0;
            if ($doc_dir_rest =~ s/^(.{1,8})//) {
                $doc_dir .= $1;
                $doc_dir_num++;
            }
        } while ($doc_dir_rest ne "" &&
                 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
                  ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
        $doc_dir .= ".dir";
    }

    &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));

    return $doc_dir;
}
288
289
# Link every associated file of a document into the document's archive
# directory and record it as metadata on the document's top section.
#
# Each entry returned by get_assoc_files() is used as an array reference:
# [0] is the path of the existing file, [1] the name it should be stored
# under, and [2] is recorded verbatim in the middle field of the
# "gsdlassocfile" metadata value.
#
# Arguments:
#   $doc_obj - the document object
#   $doc_dir - the document's directory, relative to the archive directory
sub process_assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $doc_dir) = @_;

    my $outhandle = $self->{'outhandle'};

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        # split the stored name into leading directory part and file name
        my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;

        # No file name component (empty name, or one ending in a
        # separator): there is nothing sensible to link to.
        if (!defined $afile) {
            if ($self->{'verbosity'} > 2) {
                print $outhandle "docsave::process couldn't copy the associated file " .
                    "$assoc_file->[0]: no target file name\n";
            }
            next;
        }

        if (-e $assoc_file->[0]) {
            my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
            &util::hard_link ($assoc_file->[0], $filepath);
            $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
                                         "gsdlassocfile",
                                         "$afile:$assoc_file->[2]:$dir");
            $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
                                                 "assocfilepath",
                                                 "$doc_dir");
        } elsif ($self->{'verbosity'} > 2) {
            print $outhandle "docsave::process couldn't copy the associated file " .
                "$assoc_file->[0] to $afile\n";
        }
    }
}
315
316
# Finish the output file currently being written in group mode: write the
# XML footer, close the handle, optionally gzip the file and register it
# with the archive info under the OID of the document that opened the file.
#
# Returns 1 on success (including when there was nothing to finish) and 0
# if the gzipped file does not exist afterwards.
sub close_file_output
{
    my ($self) = @_;

    # make sure that the handle has been opened - it won't be if we failed
    # to import any documents...
    if (defined(fileno(docsave::OUTDOC))) {
        $self->output_xml_footer('docsave::OUTDOC');
        close OUTDOC;
    }

    my $OID = $self->{'gs_OID'};

    # 'gs_OID' is only set by group_process() when it opens a file.  If it
    # is missing, no group file was ever started, so there is nothing to
    # compress or to register.
    return 1 unless defined $OID;

    my $short_doc_file;
    # can we use 'short_doc_file' for GA too?
    if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
        $short_doc_file = $self->{'short_doc_file'};
    } else { # "GA"
        $short_doc_file = $self->{'gs_short_filename'};
    }

    if ($self->{'gzip'}) {
        my $doc_file = $self->{'gs_filename'};
        # list form of system(): no shell, so paths with spaces or shell
        # metacharacters reach gzip unchanged
        system ('gzip', $doc_file);
        $doc_file .= ".gz";
        $short_doc_file .= ".gz";
        if (!-e $doc_file) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "error while gzipping: $doc_file doesn't exist\n";
            return 0;
        }
    }

    # store reference in the archive_info
    $self->{'archive_info'}->add_info($OID, $short_doc_file);

    return 1;
}
354
# Write the opening lines of a Greenstone archive file to $handle: the XML
# declaration, the Archive DOCTYPE and the opening <Archive> tag.
sub output_xml_header {
    my ($self, $handle) = @_;

    print {$handle} qq{<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n};
    print {$handle} qq{<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">\n};
    print {$handle} qq{<Archive>\n};
}
364
# Write the closing </Archive> tag of a Greenstone archive file to $handle.
sub output_xml_footer {
    my ($self, $handle) = @_;

    print {$handle} qq{</Archive>\n};
}
371
# Write the header of a doctxt.xml file to $handle: the XML declaration
# followed by the Archive DOCTYPE (no opening element is written).
sub output_txt_xml_header {
    my ($self, $handle) = @_;

    print {$handle} qq{<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n};
    print {$handle} qq{<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">\n};
}
378
# Write the end-of-file marker line for a doctxt.xml file to $handle.
# (process() currently has its call to this routine commented out.)
sub output_txt_xml_footer {
    my ($self, $handle) = @_;

    print {$handle} qq{<the end of the file>\n};
}
384
# Write the header of a docmets.xml file to $handle: the XML declaration,
# the Archive DOCTYPE and the opening <mets:mets> element, whose OBJID
# attribute is the document's OID followed by ":2".
#
# Arguments:
#   $handle - output handle (or handle name)
#   $OID    - the document's object identifier (written verbatim)
#
# Declared without a prototype: the routine takes arguments, so the former
# empty "()" prototype was misleading (method calls ignore prototypes, so
# existing callers are unaffected).
sub output_mets_xml_header {
    my $self = shift(@_);
    my ($handle, $OID) = @_;

    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
    print $handle '<mets:mets OBJID="' . $OID . ':2">' . "\n";
}
392
# Write the closing </mets:mets> tag of a docmets.xml file to $handle.
#
# Declared without a prototype: the routine takes arguments, so the former
# empty "()" prototype was misleading (method calls ignore prototypes, so
# existing callers are unaffected).
sub output_mets_xml_footer {
    my $self = shift(@_);
    my ($handle) = @_;

    print $handle '</mets:mets>' . "\n";
}
398
3991;
400
401
402
Note: See TracBrowser for help on using the repository browser.