source: trunk/gsdl/perllib/docsave.pm@ 2267

Last change on this file since 2267 was 2267, checked in by davidb, 23 years ago

GML file syntax altered to be XML compliant. This basically meant
turning attribute lists of metadata names (which in Greenstone can
appear multiple times within a tag) into tag names themselves, which
are then explicitly stated in a <metadata>...</metadata> block.

Newly built collection will use the new syntactic form, however the
GMLPlug file is backwards compatible and so will still import in
files in the older GML format.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
RevLine 
[537]1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[4]26# This document processor saves a document in the
27# archives directory of a collection
28
29
30package docsave;
31
32use arcinfo;
33use docproc;
34use util;
35
36
37sub BEGIN {
38 @ISA = ('docproc');
39}
40
41sub new {
[1424]42 my ($class, $collection, $archive_info, $verbosity,
43 $gzip, $groupsize, $outhandle) = @_;
[4]44 my $self = new docproc ();
[1424]45
46
[898]47 $groupsize=1 unless defined $groupsize;
[4]48 $self->{'collection'} = $collection;
49 $self->{'archive_info'} = $archive_info;
[170]50 $self->{'verbosity'} = $verbosity;
[433]51 $self->{'gzip'} = $gzip;
[898]52
[834]53 $self->{'groupsize'} = $groupsize;
54 $self->{'gs_count'} = 0;
[2267]55
56 # keep an associate array of all metavalues used by collection to
57 # help generate the XML DTD
58 $self->{'dtd_metadata'} = {};
59
[1424]60 $self->{'outhandle'} = STDERR;
61 $self->{'outhandle'} = $outhandle if defined $outhandle;
[134]62 # set a default for the archive directory
[1454]63 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
[1424]64
[1287]65 $self->{'sortmeta'} = undef;
[1424]66
[4]67 return bless $self, $class;
68}
69
[134]70sub setarchivedir {
71 my $self = shift (@_);
72 my ($archive_dir) = @_;
[1424]73
[2048]74 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
[134]75 $self->{'archive_dir'} = $archive_dir;
76}
77
[1287]78sub set_sortmeta {
79 my $self = shift (@_);
80 my ($sortmeta) = @_;
[1424]81
[1287]82 $self->{'sortmeta'} = $sortmeta;
83}
84
[4]85sub process {
86 my $self = shift (@_);
87 my ($doc_obj) = @_;
[1424]88
89 my $outhandle = $self->{'outhandle'};
90
[1287]91 if ($self->{'groupsize'} > 1) {
92 $self->group_process ($doc_obj);
[1424]93
[1287]94 } else {
95 # groupsize is 1 (i.e. one document per GML file) so sortmeta
96 # may be used
[1424]97
[1287]98 my $OID = $doc_obj->get_OID();
99 $OID = "NULL" unless defined $OID;
100
101 # get document's directory
102 my $doc_dir = $self->get_doc_dir ($OID);
[1424]103
[1287]104 # copy all the associated files, add this information as metadata
105 # to the document
106 $self->process_assoc_files ($doc_obj, $doc_dir);
[1424]107
[1287]108 my $doc_file
109 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
110 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
[1424]111
[1287]112 if (!open (OUTDOC, ">$doc_file")) {
[1424]113 print $outhandle "docsave::process could not write to file $doc_file\n";
[1287]114 return;
115 }
116
117 # save this document
[2267]118 $doc_obj->output_section('docsave::OUTDOC',
119 $doc_obj->get_top_section(),
120 $self->{'collection'},
121 $self->{'dtd_metadata'},0);
122
[1287]123 close OUTDOC;
124
125 if ($self->{'gzip'}) {
126 my $doc_file = $self->{'gs_filename'};
127 `gzip $doc_file`;
128 $doc_file .= ".gz";
129 $short_doc_file .= ".gz";
130 if (!-e $doc_file) {
[1424]131 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
[1287]132 return 0;
133 }
134 }
135
136 # do the sortmeta thing
137 my ($metadata);
138 if (defined ($self->{'sortmeta'})) {
139 $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
140 }
141
142 # store reference in the archive_info
143 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
144 }
145}
146
147sub group_process {
148 my $self = shift (@_);
149 my ($doc_obj) = @_;
[1424]150
151 my $outhandle = $self->{'outhandle'};
[1287]152
[4]153 my $OID = $doc_obj->get_OID();
154 $OID = "NULL" unless defined $OID;
155
[834]156 my $groupsize = $self->{'groupsize'};
157 my $gs_count = $self->{'gs_count'};
158 my $open_new_file = (($gs_count % $groupsize)==0);
[98]159
[834]160 # opening a new file, or document has assoicated files => directory needed
[1287]161 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
162
163 # get document's directory
164 my $doc_dir = $self->get_doc_dir ($OID);
165
[834]166 # copy all the associated files, add this information as metadata
167 # to the document
[1287]168 $self->process_assoc_files ($doc_obj, $doc_dir);
[98]169
[1287]170
171 if ($open_new_file) {
[834]172 # only if opening new file
173 my $doc_file
[1287]174 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
[834]175 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
176
177 if ($gs_count>0)
178 {
179 return if (!$self->close_file_output());
180 }
181
182 if (!open (OUTDOC, ">$doc_file")) {
[1424]183 print $outhandle "docsave::group_process could not write to file $doc_file\n";
[834]184 return;
185 }
186 $self->{'gs_filename'} = $doc_file;
187 $self->{'gs_short_filename'} = $short_doc_file;
188 $self->{'gs_OID'} = $OID;
[4]189 }
190 }
191
192 # save this document
[2267]193 $doc_obj->output_section('docsave::OUTDOC',
194 $doc_obj->get_top_section(),
195 $self->{'collection'},
196 $self->{'dtd_metadata'},0);
[433]197
[834]198 $self->{'gs_count'}++;
199}
200
[1287]201
202sub get_doc_dir {
203 my $self = shift (@_);
204 my ($OID) = @_;
205
206 my $doc_info = $self->{'archive_info'}->get_info($OID);
207 my $doc_dir = "";
208 if (defined $doc_info && scalar(@$doc_info) >= 1) {
209 # this OID already has an assigned directory, use the
210 # same one.
211 $doc_dir = $doc_info->[0];
212 $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
213 } else {
214 # have to get a new document directory
215 my $doc_dir_rest = $OID;
216 my $doc_dir_num = 0;
217 do {
218 $doc_dir .= "/" if $doc_dir_num > 0;
219 if ($doc_dir_rest =~ s/^(.{1,8})//) {
220 $doc_dir .= $1;
221 $doc_dir_num++;
222 }
223 } while ($doc_dir_rest ne "" &&
224 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
225 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
226 $doc_dir .= ".dir";
227
228 }
229
230 &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
231
232 return $doc_dir;
233}
234
235
236sub process_assoc_files {
237 my $self = shift (@_);
238 my ($doc_obj, $doc_dir) = @_;
239
[1424]240 my $outhandle = $self->{'outhandle'};
241
[1287]242 my @assoc_files = ();
243 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
244 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
245 $dir = "" unless defined $dir;
246 if (-e $assoc_file->[0]) {
247 my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
248 &util::hard_link ($assoc_file->[0], $filepath);
249 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
250 "gsdlassocfile",
251 "$afile:$assoc_file->[2]:$dir");
[2224]252 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
253 "assocfilepath",
254 "$doc_dir");
[1844]255 } elsif ($self->{'verbosity'} > 2) {
[1424]256 print $outhandle "docsave::process couldn't copy the associated file " .
[1287]257 "$assoc_file->[0] to $afile\n";
258 }
259 }
260}
261
262
[834]263sub close_file_output
264{
265 my ($self) = @_;
[1424]266
[4]267 close OUTDOC;
268
[834]269 my $OID = $self->{'gs_OID'};
270 my $short_doc_file = $self->{'gs_short_filename'};
271
[433]272 if ($self->{'gzip'}) {
[834]273 my $doc_file = $self->{'gs_filename'};
[433]274 `gzip $doc_file`;
275 $doc_file .= ".gz";
276 $short_doc_file .= ".gz";
277 if (!-e $doc_file) {
[1424]278 my $outhandle = $self->{'outhandle'};
279 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
[834]280 return 0;
[433]281 }
282 }
283
[4]284 # store reference in the archive_info
[433]285 $self->{'archive_info'}->add_info($OID, $short_doc_file);
[834]286
287 return 1;
[4]288}
289
2901;
Note: See TracBrowser for help on using the repository browser.