source: trunk/gsdl/perllib/docsave.pm@ 7902

Last change on this file since 7902 was 7902, checked in by chi, 20 years ago

Saving of documents (in archive format) extended to generate METS format
as alternative to GreenstoneArchive (GA) format. Controlled through
'import.pl -saveas METS ...'

  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29
30package docsave;
31
32eval {require bytes};
33
34use arcinfo;
35use docproc;
36use util;
37
38
39sub BEGIN {
40 @ISA = ('docproc');
41}
42
43sub new {
44 my ($class, $collection, $archive_info, $verbosity,
45 $gzip, $groupsize, $outhandle) = @_;
46 my $self = new docproc ();
47
48
49 $groupsize=1 unless defined $groupsize;
50 $self->{'collection'} = $collection;
51 $self->{'archive_info'} = $archive_info;
52 $self->{'verbosity'} = $verbosity;
53 $self->{'gzip'} = $gzip;
54
55 $self->{'groupsize'} = $groupsize;
56 $self->{'gs_count'} = 0;
57
58 $self->{'outhandle'} = STDERR;
59 $self->{'outhandle'} = $outhandle if defined $outhandle;
60 # set a default for the archive directory
61 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
62
63 $self->{'sortmeta'} = undef;
64
65 return bless $self, $class;
66}
67
68sub setarchivedir {
69 my $self = shift (@_);
70 my ($archive_dir) = @_;
71
72 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
73 $self->{'archive_dir'} = $archive_dir;
74}
75
76sub set_sortmeta {
77 my $self = shift (@_);
78 my ($sortmeta) = @_;
79
80 $self->{'sortmeta'} = $sortmeta;
81}
82
83sub process {
84 my $self = shift (@_);
85 my ($doc_obj) = @_;
86
87 my $outhandle = $self->{'outhandle'};
88
89 if ($self->{'groupsize'} > 1) {
90 $self->group_process ($doc_obj);
91
92 } else {
93 # groupsize is 1 (i.e. one document per XML file) so sortmeta
94 # may be used
95
96 my $OID = $doc_obj->get_OID();
97 $OID = "NULL" unless defined $OID;
98
99
100 # get document's directory
101 my $doc_dir = $self->get_doc_dir ($OID);
102
103
104 # copy all the associated files, add this information as metadata
105 # to the document
106 $self->process_assoc_files ($doc_obj, $doc_dir);
107
108 my $doc_file
109 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
110
111 #***define doctxt.xml file
112 my $doc_txt_file
113 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
114 my $working_dir
115 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
116
117 #***define docmets.xmlfile
118 my $doc_mets_file
119 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
120
121 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
122 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
123 my $short_mets_doc_file=&util::filename_cat ($doc_dir, "docmets.xml");
124
125
126 if (!open (OUTDOC, ">$doc_file")) {
127 print $outhandle "docsave::process could not write to file $doc_file\n";
128 return;
129 }
130
131 # save this document
132 $self->output_xml_header('docsave::OUTDOC');
133 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
134 $self->output_xml_footer('docsave::OUTDOC');
135
136 #***save the document without metadata:doctxt.xml: Chi-Yu Huang
137
138 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
139 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
140 return;
141 }
142
143 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
144 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
145 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
146
147 #***Convert doctxt.xml file to docmets.xml: Chi-Yu Huang
148 if (!open(OUTDOC_METS,">$doc_mets_file")){
149 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
150 return;
151 }
152
153 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
154 $doc_obj->output_mets_section('docsave::OUTDOC_METS',$doc_obj->get_top_section(), $working_dir);
155 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
156
157 close OUTDOC;
158 close OUTDOC_TXT;
159 close OUTDOC_METS;
160
161 if ($self->{'gzip'}) {
162 my $doc_file = $self->{'gs_filename'};
163 `gzip $doc_file`;
164 $doc_file .= ".gz";
165 $short_doc_file .= ".gz";
166 if (!-e $doc_file) {
167 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
168 return 0;
169 }
170 }
171
172 # do the sortmeta thing
173 my ($metadata); if (defined ($self->{'sortmeta'})) {
174 $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
175 }
176
177 # store reference in the archive_info
178 if ($self->{'saveas'} eq "METS"){
179 $self->{'archive_info'}->add_info($OID, $short_mets_doc_file, $metadata);
180 } else {
181 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
182 }
183 }
184}
185
186
187sub group_process {
188 my $self = shift (@_);
189 my ($doc_obj) = @_;
190
191 my $outhandle = $self->{'outhandle'};
192
193 my $OID = $doc_obj->get_OID();
194 $OID = "NULL" unless defined $OID;
195
196 my $groupsize = $self->{'groupsize'};
197 my $gs_count = $self->{'gs_count'};
198 my $open_new_file = (($gs_count % $groupsize)==0);
199
200 # opening a new file, or document has assoicated files => directory needed
201 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
202
203 # get document's directory
204 my $doc_dir = $self->get_doc_dir ($OID);
205
206 # copy all the associated files, add this information as metadata
207 # to the document
208 $self->process_assoc_files ($doc_obj, $doc_dir);
209
210
211 if ($open_new_file) {
212 # only if opening new file
213 my $doc_file
214 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
215 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
216
217 if ($gs_count>0)
218 {
219 return if (!$self->close_file_output());
220 }
221
222 if (!open (OUTDOC, ">$doc_file")) {
223 print $outhandle "docsave::group_process could not write to file $doc_file\n";
224 return;
225 }
226 $self->{'gs_filename'} = $doc_file;
227 $self->{'gs_short_filename'} = $short_doc_file;
228 $self->{'gs_OID'} = $OID;
229
230 $self->output_xml_header('docsave::OUTDOC');
231 }
232 }
233
234 # save this document
235 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
236
237 $self->{'gs_count'}++;
238}
239
240
241sub get_doc_dir {
242 my $self = shift (@_);
243 my ($OID) = @_;
244
245 my $doc_info = $self->{'archive_info'}->get_info($OID);
246 my $doc_dir = "";
247 if (defined $doc_info && scalar(@$doc_info) >= 1) {
248 # this OID already has an assigned directory, use the
249 # same one.
250 $doc_dir = $doc_info->[0];
251 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
252 } else {
253 # have to get a new document directory
254 my $doc_dir_rest = $OID;
255 my $doc_dir_num = 0;
256 do {
257 $doc_dir .= "/" if $doc_dir_num > 0;
258 if ($doc_dir_rest =~ s/^(.{1,8})//) {
259 $doc_dir .= $1;
260 $doc_dir_num++;
261 }
262 } while ($doc_dir_rest ne "" &&
263 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
264 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
265 $doc_dir .= ".dir";
266
267 }
268
269 &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
270
271 return $doc_dir;
272}
273
274
275sub process_assoc_files {
276 my $self = shift (@_);
277 my ($doc_obj, $doc_dir) = @_;
278
279 my $outhandle = $self->{'outhandle'};
280
281 my @assoc_files = ();
282 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
283 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
284 $dir = "" unless defined $dir;
285 if (-e $assoc_file->[0]) {
286 my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
287 &util::hard_link ($assoc_file->[0], $filepath);
288 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
289 "gsdlassocfile",
290 "$afile:$assoc_file->[2]:$dir");
291 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
292 "assocfilepath",
293 "$doc_dir");
294 } elsif ($self->{'verbosity'} > 2) {
295 print $outhandle "docsave::process couldn't copy the associated file " .
296 "$assoc_file->[0] to $afile\n";
297 }
298 }
299}
300
301
302sub close_file_output
303{
304 my ($self) = @_;
305
306 # make sure that the handle has been opened - it won't be if we failed
307 # to import any documents...
308 if (defined(fileno(docsave::OUTDOC))) {
309 $self->output_xml_footer('docsave::OUTDOC');
310 close OUTDOC;
311 }
312
313 my $OID = $self->{'gs_OID'};
314 my $short_doc_file = $self->{'gs_short_filename'};
315
316 if ($self->{'gzip'}) {
317 my $doc_file = $self->{'gs_filename'};
318 `gzip $doc_file`;
319 $doc_file .= ".gz";
320 $short_doc_file .= ".gz";
321 if (!-e $doc_file) {
322 my $outhandle = $self->{'outhandle'};
323 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
324 return 0;
325 }
326 }
327
328 # store reference in the archive_info
329 if ($self->{'saveas'} eq "METS"){
330 $self->{'archive_info'}->add_info($OID, $short_mets_doc_file);
331 } else {
332 $self->{'archive_info'}->add_info($OID, $short_doc_file);
333 }
334
335 return 1;
336}
337
338sub output_xml_header {
339 my $self = shift (@_);
340 my ($handle) = @_;
341
342 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
343
344 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
345 print $handle "<Archive>\n";
346}
347
348sub output_xml_footer {
349 my $self = shift (@_);
350 my ($handle) = @_;
351
352 print $handle "</Archive>\n";
353}
354
355sub output_txt_xml_header{
356 my $self = shift (@_);
357 my ($handle) = @_;
358 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
359 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
360}
361
362sub output_txt_xml_footer{
363 my $self = shift(@_);
364 my ($handle) = @_;
365 print $handle "<the end of the file>\n";
366}
367
368sub output_mets_xml_header(){
369 my $self = shift(@_);
370 my ($handle, $OID) = @_;
371 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
372 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
373 print $handle '<mets:mets OBJID="'. $OID. ':2">' . "\n";
374}
375
376sub output_mets_xml_footer() {
377 my $self = shift(@_);
378 my ($handle) = @_;
379 print $handle '</mets:mets>' . "\n";
380}
381
3821;
383
384
385
Note: See TracBrowser for help on using the repository browser.