source: main/tags/2.51-fiji/gsdl/perllib/docsave.pm@ 24574

Last change on this file since 24574 was 8079, checked in by davidb, 20 years ago

docsave.pm had been saving both GA and METS format. if-statement added to
choose one or other based on -saveas flag.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29
30package docsave;
31
32eval {require bytes};
33
34use arcinfo;
35use docproc;
36use util;
37
38
39sub BEGIN {
40 @ISA = ('docproc');
41}
42
43sub new {
44 my ($class, $collection, $archive_info, $verbosity,
45 $gzip, $groupsize, $outhandle) = @_;
46 my $self = new docproc ();
47
48
49 $groupsize=1 unless defined $groupsize;
50 $self->{'collection'} = $collection;
51 $self->{'archive_info'} = $archive_info;
52 $self->{'verbosity'} = $verbosity;
53 $self->{'gzip'} = $gzip;
54
55 $self->{'groupsize'} = $groupsize;
56 $self->{'gs_count'} = 0;
57
58 $self->{'outhandle'} = STDERR;
59 $self->{'outhandle'} = $outhandle if defined $outhandle;
60 # set a default for the archive directory
61 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
62
63 $self->{'sortmeta'} = undef;
64
65 return bless $self, $class;
66}
67
68sub setarchivedir {
69 my $self = shift (@_);
70 my ($archive_dir) = @_;
71
72 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
73 $self->{'archive_dir'} = $archive_dir;
74}
75
76sub set_sortmeta {
77 my $self = shift (@_);
78 my ($sortmeta) = @_;
79
80 $self->{'sortmeta'} = $sortmeta;
81}
82
83sub process {
84 my $self = shift (@_);
85 my ($doc_obj) = @_;
86
87 my $outhandle = $self->{'outhandle'};
88
89 if ($self->{'groupsize'} > 1) {
90 $self->group_process ($doc_obj);
91
92 } else {
93 # groupsize is 1 (i.e. one document per XML file) so sortmeta
94 # may be used
95
96 my $OID = $doc_obj->get_OID();
97 $OID = "NULL" unless defined $OID;
98
99
100 # get document's directory
101 my $doc_dir = $self->get_doc_dir ($OID);
102
103
104 # copy all the associated files, add this information as metadata
105 # to the document
106 $self->process_assoc_files ($doc_obj, $doc_dir);
107
108 my $doc_file
109 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
110
111 #***define doctxt.xml file
112 my $doc_txt_file
113 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
114 my $working_dir
115 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
116
117 #***define docmets.xmlfile
118 my $doc_mets_file
119 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
120
121 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
122 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
123 my $short_mets_doc_file=&util::filename_cat ($doc_dir, "docmets.xml");
124
125 my $save_as = $self->{'saveas'};
126
127 if ($save_as eq "GA") {
128 if (!open (OUTDOC, ">$doc_file")) {
129 print $outhandle "docsave::process could not write to file $doc_file\n";
130 return;
131 }
132
133 # save this document
134 $self->output_xml_header('docsave::OUTDOC');
135 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
136 $self->output_xml_footer('docsave::OUTDOC');
137
138 close OUTDOC;
139 }
140 elsif ($save_as eq "METS") {
141 # save the document without metadata:doctxt.xml
142
143 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
144 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
145 return;
146 }
147
148 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
149 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
150 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
151
152 # Convert doctxt.xml file to docmets.xml
153 if (!open(OUTDOC_METS,">$doc_mets_file")){
154 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
155 return;
156 }
157
158 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
159 $doc_obj->output_mets_section('docsave::OUTDOC_METS',$doc_obj->get_top_section(), $working_dir);
160 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
161
162 close OUTDOC_TXT;
163 close OUTDOC_METS;
164 }
165 else {
166 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
167 return;
168 }
169
170 if ($self->{'gzip'}) {
171 my $doc_file = $self->{'gs_filename'};
172 `gzip $doc_file`;
173 $doc_file .= ".gz";
174 $short_doc_file .= ".gz";
175 if (!-e $doc_file) {
176 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
177 return 0;
178 }
179 }
180
181 # do the sortmeta thing
182 my ($metadata); if (defined ($self->{'sortmeta'})) {
183 $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
184 }
185
186 # store reference in the archive_info
187 if ($self->{'saveas'} eq "METS"){
188 $self->{'archive_info'}->add_info($OID, $short_mets_doc_file, $metadata);
189 } else {
190 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
191 }
192 }
193}
194
195
196sub group_process {
197 my $self = shift (@_);
198 my ($doc_obj) = @_;
199
200 my $outhandle = $self->{'outhandle'};
201
202 my $OID = $doc_obj->get_OID();
203 $OID = "NULL" unless defined $OID;
204
205 my $groupsize = $self->{'groupsize'};
206 my $gs_count = $self->{'gs_count'};
207 my $open_new_file = (($gs_count % $groupsize)==0);
208
209 # opening a new file, or document has assoicated files => directory needed
210 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
211
212 # get document's directory
213 my $doc_dir = $self->get_doc_dir ($OID);
214
215 # copy all the associated files, add this information as metadata
216 # to the document
217 $self->process_assoc_files ($doc_obj, $doc_dir);
218
219
220 if ($open_new_file) {
221 # only if opening new file
222 my $doc_file
223 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
224 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
225
226 if ($gs_count>0)
227 {
228 return if (!$self->close_file_output());
229 }
230
231 if (!open (OUTDOC, ">$doc_file")) {
232 print $outhandle "docsave::group_process could not write to file $doc_file\n";
233 return;
234 }
235 $self->{'gs_filename'} = $doc_file;
236 $self->{'gs_short_filename'} = $short_doc_file;
237 $self->{'gs_OID'} = $OID;
238
239 $self->output_xml_header('docsave::OUTDOC');
240 }
241 }
242
243 # save this document
244 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
245
246 $self->{'gs_count'}++;
247}
248
249
250sub get_doc_dir {
251 my $self = shift (@_);
252 my ($OID) = @_;
253
254 my $doc_info = $self->{'archive_info'}->get_info($OID);
255 my $doc_dir = "";
256 if (defined $doc_info && scalar(@$doc_info) >= 1) {
257 # this OID already has an assigned directory, use the
258 # same one.
259 $doc_dir = $doc_info->[0];
260 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
261 } else {
262 # have to get a new document directory
263 my $doc_dir_rest = $OID;
264 my $doc_dir_num = 0;
265 do {
266 $doc_dir .= "/" if $doc_dir_num > 0;
267 if ($doc_dir_rest =~ s/^(.{1,8})//) {
268 $doc_dir .= $1;
269 $doc_dir_num++;
270 }
271 } while ($doc_dir_rest ne "" &&
272 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
273 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
274 $doc_dir .= ".dir";
275
276 }
277
278 &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
279
280 return $doc_dir;
281}
282
283
284sub process_assoc_files {
285 my $self = shift (@_);
286 my ($doc_obj, $doc_dir) = @_;
287
288 my $outhandle = $self->{'outhandle'};
289
290 my @assoc_files = ();
291 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
292 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
293 $dir = "" unless defined $dir;
294 if (-e $assoc_file->[0]) {
295 my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
296 &util::hard_link ($assoc_file->[0], $filepath);
297 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
298 "gsdlassocfile",
299 "$afile:$assoc_file->[2]:$dir");
300 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
301 "assocfilepath",
302 "$doc_dir");
303 } elsif ($self->{'verbosity'} > 2) {
304 print $outhandle "docsave::process couldn't copy the associated file " .
305 "$assoc_file->[0] to $afile\n";
306 }
307 }
308}
309
310
311sub close_file_output
312{
313 my ($self) = @_;
314
315 # make sure that the handle has been opened - it won't be if we failed
316 # to import any documents...
317 if (defined(fileno(docsave::OUTDOC))) {
318 $self->output_xml_footer('docsave::OUTDOC');
319 close OUTDOC;
320 }
321
322 my $OID = $self->{'gs_OID'};
323 my $short_doc_file = $self->{'gs_short_filename'};
324
325 if ($self->{'gzip'}) {
326 my $doc_file = $self->{'gs_filename'};
327 `gzip $doc_file`;
328 $doc_file .= ".gz";
329 $short_doc_file .= ".gz";
330 if (!-e $doc_file) {
331 my $outhandle = $self->{'outhandle'};
332 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
333 return 0;
334 }
335 }
336
337 # store reference in the archive_info
338 if ($self->{'saveas'} eq "METS"){
339 $self->{'archive_info'}->add_info($OID, $short_mets_doc_file);
340 } else {
341 $self->{'archive_info'}->add_info($OID, $short_doc_file);
342 }
343
344 return 1;
345}
346
347sub output_xml_header {
348 my $self = shift (@_);
349 my ($handle) = @_;
350
351 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
352
353 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
354 print $handle "<Archive>\n";
355}
356
357sub output_xml_footer {
358 my $self = shift (@_);
359 my ($handle) = @_;
360
361 print $handle "</Archive>\n";
362}
363
364sub output_txt_xml_header{
365 my $self = shift (@_);
366 my ($handle) = @_;
367 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
368 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
369}
370
371sub output_txt_xml_footer{
372 my $self = shift(@_);
373 my ($handle) = @_;
374 print $handle "<the end of the file>\n";
375}
376
377sub output_mets_xml_header(){
378 my $self = shift(@_);
379 my ($handle, $OID) = @_;
380 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
381 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
382 print $handle '<mets:mets OBJID="'. $OID. ':2">' . "\n";
383}
384
385sub output_mets_xml_footer() {
386 my $self = shift(@_);
387 my ($handle) = @_;
388 print $handle '</mets:mets>' . "\n";
389}
390
3911;
392
393
394
Note: See TracBrowser for help on using the repository browser.