source: trunk/gsdl/bin/script/import.pl@ 2287

Last change on this file since 2287 was 2287, checked in by sjboddie, 23 years ago

Reverted import.pl to version 1.25 as the last commit wiped out some of
the more recent changes. There didn't appear to be any new changes in the
now defunct version 1.26 so I'm guessing it was just a mistake. My sincere
apologies to David if I've screwed something up (but he started it ;-)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
BEGIN {
    # Refuse to start unless the Greenstone environment has been set up.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Make the Greenstone library directories visible to 'use'/'require'.
    # Each unshift lands in front of the previous one, so the resulting
    # search order is classify, plugins, perllib, then the stock @INC.
    foreach my $libdir ("perllib", "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$libdir");
    }
}
40
41use arcinfo;
42use colcfg;
43use plugin;
44use docprint;
45use util;
46use parsargv;
47use FileHandle;
48
# print_usage -- write the command-line help for import.pl to STDERR.
# Takes no arguments. The default shown for -collectdir is built from
# $ENV{'GSDLHOME'} via util::filename_cat.
sub print_usage {
    # Everything that precedes the -collectdir entry is fixed text
    # (apart from the program name), so it is emitted from a list.
    my @leading_text = (
        "\n",
        "import.pl: Converts documents in collections -importdir directory into\n",
        " gml documents which are written to the -archivedir directory.\n\n",
        " usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        " -importdir directory Where the original material lives\n",
        " -archivedir directory Where the converted material ends up\n",
        " -keepold Will not destroy the current contents of the\n",
        " archives directory (the default)\n",
        " -removeold Will remove the old contents of the archives\n",
        " directory -- use with care\n",
        " -gzip Use gzip to compress resulting gml documents\n",
        " (don't forget to include ZIPPlug in your plugin\n",
        " list when building from compressed documents)\n",
        " -maxdocs number Maximum number of documents to import\n",
        " -groupsize number Number of GML documents to group into one file\n",
        " -sortmeta metadata Sort documents alphabetically by metadata for\n",
        " building. This will be disabled if groupsize > 1\n",
        " -debug Print imported text to STDOUT\n",
    );
    foreach my $text (@leading_text) {
        print STDERR $text;
    }

    # The -collectdir entry reports the directory used when the option
    # is not given.
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");
    print STDERR " -collectdir directory Collection directory (defaults to $default_collectdir)\n";

    print STDERR " -out Filename or handle to print output status to.\n";
    print STDERR " The default is STDERR\n\n";
}
75
&main();

# main -- entry point for import.pl.
#
# Parses the command line, reads the collection's etc/collect.cfg, loads
# the configured plugins and runs them over the import directory. Unless
# -debug is given, the results are handed to a docsave object and the
# archive information file (archives.inf) in the archives directory is
# rewritten at the end; with -debug a docprint object is used instead and
# no archive information is read or written.
#
# Takes no arguments (options come from @ARGV). Dies, after printing the
# usage text where appropriate, when the options cannot be parsed, no
# collection can be determined, collect.cfg is missing, the -out file
# cannot be opened, or no plugins load.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg, $pluginfo, $sortmeta,
        $archive_info_filename, $archive_info, $processor,
        $out, $collectdir);

    # NOTE(review): each spec looks like "name/value-regex/default", with
    # bare names being flags -- confirm against parsargv.pm.
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # $out ends up holding the *name* of a filehandle; it is used as a
    # symbolic handle below and passed on to the plugin and docsave code.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive, but only the upper-case names
        # are real handles, so normalise e.g. "stderr" to "STDERR".
        $out = uc ($out);
    } else {
        # Three-argument open: the filename can never be mistaken for
        # part of the open mode.
        open (OUT, '>', $out) || die "Couldn't open output file $out: $!\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold overrides -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name. Only the first remaining
    # command-line argument is passed: flattening all of @ARGV into the
    # call would let -collectdir be taken as the collection name when no
    # name was given, and a stray second argument be taken as the
    # collect directory.
    my $collection_arg = defined ($ARGV[0]) ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # check sortmeta: a blank value means "not set", and sorting is not
    # available when documents are grouped into files
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # dynamically load the 'docsave' module so that a collection-specific
    # docsave.pm, if one is provided in the collection's perllib
    # directory, is picked up in preference to the standard one.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection; collect.cfg may also
    # supply importdir/archivedir/removeold when they were not given on
    # the command line
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'plugin'}) {
            $plugins = $collectcfg->{'plugin'};
        }
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            # the config file may switch removeold on, but never when
            # -keepold was given on the command line
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            # only reached with $removeold already false, so this just
            # normalises the value to 0
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Warning - removing current contents of the archives directory\n";
        print $out " in preparation for the import\n";
        sleep(5); # just in case... (gives the user time to interrupt)
        &util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
    } else {
        # debug mode: documents go to a docprint object instead
        $processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # close the status file if we opened one; report a failed close
    # rather than silently losing buffered output
    if ($close_out) {
        close (OUT) || warn "Couldn't close output file: $!\n";
    }
}
Note: See TracBrowser for help on using the repository browser.