source: trunk/gsdl/bin/script/import.pl@ 2268

Last change on this file since 2268 was 2268, checked in by davidb, 23 years ago

Augmented so GML files generated conform to XML syntax. Main addition
at the top level is to generate a DTD for the collection once all
the files have been read in.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
BEGIN {
    # Both environment variables are mandatory; refuse to start without them.
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Put the Greenstone library directories at the front of the module
    # search path.  The resulting order is classify, plugins, perllib.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}
40
41use arcinfo;
42use colcfg;
43use plugin;
44use docprint;
45use util;
46use parsargv;
47use FileHandle;
48
# print_usage --
# Writes the command-line synopsis and the list of supported options
# to STDERR.  Takes no arguments.
sub print_usage {
    # One entry per output line; empty entries produce the blank lines
    # that frame the synopsis and terminate the message.
    my @usage_lines = (
        "",
        " usage: $0 [options] collection-name",
        "",
        " options:",
        " -verbosity number 0=none, 3=lots",
        " -importdir directory Where the original material lives",
        " -archivedir directory Where the converted material ends up",
        " -keepold Will not destroy the current contents of the",
        " archives directory (the default)",
        " -removeold Will remove the old contents of the archives",
        " directory -- use with care",
        " -gzip Use gzip to compress resulting gml documents",
        " (don't forget to include ZIPPlug in your plugin",
        " list when building from compressed documents)",
        " -maxdocs number Maximum number of documents to import",
        " -groupsize number Number of GML documents to group into one file",
        " -sortmeta metadata Sort documents alphabetically by metadata for",
        " building. This will be disabled if groupsize > 1",
        " -debug Print imported text to STDOUT",
        " -out Filename or handle to print output status to.",
        " The default is STDERR",
        "",
    );

    print STDERR map { "$_\n" } @usage_lines;
}
70
# Run the import; everything else in this file is a definition.
main();
72
# main --
# Entry point of the import script.  Takes no arguments; everything is
# read from @ARGV and %ENV.  In order it:
#   1. parses the command-line options,
#   2. sets up the status output handle named by -out,
#   3. selects the collection and reads its etc/collect.cfg,
#   4. loads the plugins listed in the configuration,
#   5. optionally removes the existing archives directory,
#   6. runs the plugins over the import directory, handing documents to a
#      docsave object (or to a docprint object when -debug is given),
#   7. saves the archive information file (not in -debug mode).
# Dies if the options cannot be parsed, no collection is selected, the
# output file cannot be opened, the configuration file is missing, or no
# plugins were loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg, $pluginfo, $sortmeta,
        $archive_info_filename, $archive_info, $processor, $out);

    # Each option spec is followed by a reference to the variable that
    # receives its value.  The specs look like name/pattern/default --
    # see parsargv for the exact format.
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # $out holds the *name* of a filehandle (a plain string), because that
    # name is what gets passed on to the plugin and docsave modules below.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive, but only the upper-case names are
        # the predefined handles inside this package; lower-case "stderr"
        # would resolve to an unopened import::stderr.
        $out = uc($out);
    } else {
        # Three-argument open: the user-supplied name is always treated as
        # a plain file to write, never as a mode, pipe or dup specification.
        open (OUT, '>', $out) || die "Couldn't open output file $out: $!\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold overrides -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV)) eq "") {
        &print_usage();
        die "\n";
    }

    # check sortmeta: an empty or whitespace-only value means "not given"
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # Load the 'docsave' module at run time rather than with 'use', so that
    # a collection-specific docsave.pm, if one exists in the collection's
    # perllib directory, is found first.
    # NOTE(review): GSDLCOLLECTDIR is not checked in the BEGIN block; it is
    # presumably set by util::use_collection above -- confirm.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc/collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'plugin'}) {
            $plugins = $collectcfg->{'plugin'};
        }
        # directories given on the command line win over the config file
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        # the config file may request removeold, but -keepold still wins
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = "$ENV{'GSDLCOLLECTDIR'}/import" if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = "$ENV{'GSDLCOLLECTDIR'}/archives" if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    # NOTE(review): this also happens in -debug mode, where nothing is
    # written back to the archives directory -- confirm that is intended.
    if ($removeold && -e $archivedir) {
        print $out "Warning - removing current contents of the archives directory\n";
        print $out " in preparation for the import\n";
        sleep(5); # just in case...
        &util::rm_r ($archivedir);
    }

    # choose the document processor
    if (!$debug) {
        # read the archive information file
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
    } else {
        # -debug: documents go to a docprint object instead of the archives
        $processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo,$processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.