source: trunk/gsdl/perllib/docsave.pm@ 2018

Last change on this file since 2018 was 1844, checked in by sjboddie, 23 years ago

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). Wihen -input_encoding is 'auto' textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection
28
29
30package docsave;
31
32use arcinfo;
33use docproc;
34use util;
35
36
37sub BEGIN {
38 @ISA = ('docproc');
39}
40
41sub new {
42 my ($class, $collection, $archive_info, $verbosity,
43 $gzip, $groupsize, $outhandle) = @_;
44 my $self = new docproc ();
45
46
47 $groupsize=1 unless defined $groupsize;
48 $self->{'collection'} = $collection;
49 $self->{'archive_info'} = $archive_info;
50 $self->{'verbosity'} = $verbosity;
51 $self->{'gzip'} = $gzip;
52
53 $self->{'groupsize'} = $groupsize;
54 $self->{'gs_count'} = 0;
55
56 $self->{'outhandle'} = STDERR;
57 $self->{'outhandle'} = $outhandle if defined $outhandle;
58
59 # set a default for the archive directory
60 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
61
62 $self->{'sortmeta'} = undef;
63
64 return bless $self, $class;
65}
66
67sub setarchivedir {
68 my $self = shift (@_);
69 my ($archive_dir) = @_;
70
71 $self->{'archive_dir'} = $archive_dir;
72}
73
74sub set_sortmeta {
75 my $self = shift (@_);
76 my ($sortmeta) = @_;
77
78 $self->{'sortmeta'} = $sortmeta;
79}
80
81sub process {
82 my $self = shift (@_);
83 my ($doc_obj) = @_;
84
85 my $outhandle = $self->{'outhandle'};
86
87 if ($self->{'groupsize'} > 1) {
88 $self->group_process ($doc_obj);
89
90 } else {
91 # groupsize is 1 (i.e. one document per GML file) so sortmeta
92 # may be used
93
94 my $OID = $doc_obj->get_OID();
95 $OID = "NULL" unless defined $OID;
96
97 # get document's directory
98 my $doc_dir = $self->get_doc_dir ($OID);
99
100 # copy all the associated files, add this information as metadata
101 # to the document
102 $self->process_assoc_files ($doc_obj, $doc_dir);
103
104 my $doc_file
105 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
106 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
107
108 if (!open (OUTDOC, ">$doc_file")) {
109 print $outhandle "docsave::process could not write to file $doc_file\n";
110 return;
111 }
112
113 # save this document
114 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
115 close OUTDOC;
116
117 if ($self->{'gzip'}) {
118 my $doc_file = $self->{'gs_filename'};
119 `gzip $doc_file`;
120 $doc_file .= ".gz";
121 $short_doc_file .= ".gz";
122 if (!-e $doc_file) {
123 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
124 return 0;
125 }
126 }
127
128 # do the sortmeta thing
129 my ($metadata);
130 if (defined ($self->{'sortmeta'})) {
131 $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
132 }
133
134 # store reference in the archive_info
135 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
136 }
137}
138
139sub group_process {
140 my $self = shift (@_);
141 my ($doc_obj) = @_;
142
143 my $outhandle = $self->{'outhandle'};
144
145 my $OID = $doc_obj->get_OID();
146 $OID = "NULL" unless defined $OID;
147
148 my $groupsize = $self->{'groupsize'};
149 my $gs_count = $self->{'gs_count'};
150 my $open_new_file = (($gs_count % $groupsize)==0);
151
152 # opening a new file, or document has assoicated files => directory needed
153 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
154
155 # get document's directory
156 my $doc_dir = $self->get_doc_dir ($OID);
157
158 # copy all the associated files, add this information as metadata
159 # to the document
160 $self->process_assoc_files ($doc_obj, $doc_dir);
161
162
163 if ($open_new_file) {
164 # only if opening new file
165 my $doc_file
166 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
167 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
168
169 if ($gs_count>0)
170 {
171 return if (!$self->close_file_output());
172 }
173
174 if (!open (OUTDOC, ">$doc_file")) {
175 print $outhandle "docsave::group_process could not write to file $doc_file\n";
176 return;
177 }
178 $self->{'gs_filename'} = $doc_file;
179 $self->{'gs_short_filename'} = $short_doc_file;
180 $self->{'gs_OID'} = $OID;
181 }
182 }
183
184 # save this document
185 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
186
187 $self->{'gs_count'}++;
188}
189
190
191sub get_doc_dir {
192 my $self = shift (@_);
193 my ($OID) = @_;
194
195 my $doc_info = $self->{'archive_info'}->get_info($OID);
196 my $doc_dir = "";
197 if (defined $doc_info && scalar(@$doc_info) >= 1) {
198 # this OID already has an assigned directory, use the
199 # same one.
200 $doc_dir = $doc_info->[0];
201 $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
202 } else {
203 # have to get a new document directory
204 my $doc_dir_rest = $OID;
205 my $doc_dir_num = 0;
206 do {
207 $doc_dir .= "/" if $doc_dir_num > 0;
208 if ($doc_dir_rest =~ s/^(.{1,8})//) {
209 $doc_dir .= $1;
210 $doc_dir_num++;
211 }
212 } while ($doc_dir_rest ne "" &&
213 ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
214 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
215 $doc_dir .= ".dir";
216
217 }
218
219 &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
220
221 return $doc_dir;
222}
223
224
225sub process_assoc_files {
226 my $self = shift (@_);
227 my ($doc_obj, $doc_dir) = @_;
228
229 my $outhandle = $self->{'outhandle'};
230
231 my @assoc_files = ();
232 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
233 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
234 $dir = "" unless defined $dir;
235 if (-e $assoc_file->[0]) {
236 my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
237 &util::hard_link ($assoc_file->[0], $filepath);
238 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
239 "gsdlassocfile",
240 "$afile:$assoc_file->[2]:$dir");
241 } elsif ($self->{'verbosity'} > 2) {
242 print $outhandle "docsave::process couldn't copy the associated file " .
243 "$assoc_file->[0] to $afile\n";
244 }
245 }
246}
247
248
249sub close_file_output
250{
251 my ($self) = @_;
252
253 close OUTDOC;
254
255 my $OID = $self->{'gs_OID'};
256 my $short_doc_file = $self->{'gs_short_filename'};
257
258 if ($self->{'gzip'}) {
259 my $doc_file = $self->{'gs_filename'};
260 `gzip $doc_file`;
261 $doc_file .= ".gz";
262 $short_doc_file .= ".gz";
263 if (!-e $doc_file) {
264 my $outhandle = $self->{'outhandle'};
265 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
266 return 0;
267 }
268 }
269
270 # store reference in the archive_info
271 $self->{'archive_info'}->add_info($OID, $short_doc_file);
272
273 return 1;
274}
275
2761;
Note: See TracBrowser for help on using the repository browser.