source: trunk/gsdl/bin/script/import.pl@ 1287

Last change on this file since 1287 was 1287, checked in by sjboddie, 24 years ago

Implemented a -sortmeta option for import.pl to sort archives.inf file
(generated at end of import process) alphabetically by the given
metadata element. This may be useful for some collections as boolean
queries currently return matches in build (fairly random) order. Changing
the order of archives.inf changes the order that documents are built.
This option has a couple of important limitations:

  1. Can't be used in conjunction with the groupsize option as it would then only change the build order of groups of documents which doesn't seem very useful.
  2. Is of limited use when building indexes at a section level as the build order is only sorted by document, not by section.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
# Compile-time setup: insist on the Greenstone environment variables and put
# the perllib directories under GSDLHOME at the front of the module search
# path, ahead of the 'use' statements that follow.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # A single unshift with the directories listed in final search order:
    # classify first, then plugins, then perllib itself.  This is the same
    # @INC that three successive unshifts (perllib, plugins, classify)
    # would produce.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift @INC, "$perllib/classify", "$perllib/plugins", $perllib;
}
38
39use strict;
40use arcinfo;
41use colcfg;
42use plugin;
43use docprint;
44use util;
45use parsargv;
46
# Print the command-line synopsis and the list of supported options to STDERR.
# Takes no arguments.  The script name shown is taken from $0.
sub print_usage {
    # One heredoc keeps the option and description columns visibly aligned
    # in the source, so continuation lines line up under their description.
    print STDERR <<"END_USAGE";

  usage: $0 [options] collection-name

  options:
   -verbosity number      0=none, 3=lots
   -importdir directory   Where the original material lives
   -archivedir directory  Where the converted material ends up
   -keepold               Will not destroy the current contents of the
                          archives directory (the default)
   -removeold             Will remove the old contents of the archives
                          directory -- use with care
   -gzip                  Use gzip to compress resulting gml documents
                          (don't forget to include ZIPPlug in your plugin
                          list when building from compressed documents)
   -maxdocs number        Maximum number of documents to import
   -groupsize number      Number of GML documents to group into one file
   -sortmeta metadata     Sort documents alphabetically by metadata for
                          building. This will be disabled if groupsize > 1
   -debug                 Print imported text to STDOUT

END_USAGE
}
66
67
# Script entry point: run the import.  Called with explicit parentheses, so
# the forward reference to main (defined further down) resolves at run time.
main();
69
# Import a collection: parse the command line, read the collection's
# etc/collect.cfg, load the configured plugins, run them over the import
# directory and finally write the archives.inf file.
#
# Takes no arguments; options and the collection name are read from @ARGV.
#
# Dies (after printing the usage message) when the options cannot be parsed
# or no collection name is given, and dies when collect.cfg is missing or no
# plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg, $pluginfo, $sortmeta,
        $archive_info_filename, $archive_info, $processor);

    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs)) {
        print_usage();
        die "\n";
    }

    # -keepold wins when both -keepold and -removeold are given
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = util::use_collection(@ARGV)) eq "") {
        print_usage();
        die "\n";
    }

    # check sortmeta: a blank value counts as "not given", and sorting is
    # not possible when documents are grouped into shared files
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print STDERR "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print STDERR " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # dynamically load the 'docsave' module so that a collection specific
    # docsave.pm is picked up if one is present in the collection's perllib
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins (and any directory/removeold defaults) for
    # this collection
    my $plugins = [];
    $configfilename = util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc/collect.cfg");
    if (-e $configfilename) {
        $collectcfg = colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'plugin'}) {
            $plugins = $collectcfg->{'plugin'};
        }
        # command line values take precedence over collect.cfg
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        # "removeold true" in collect.cfg only applies when -keepold was
        # not given.  "removeold false" needs no action: removeold is
        # already off unless -removeold was given on the command line,
        # and that takes precedence over the configuration file.
        if (defined $collectcfg->{'removeold'}
            && $collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir  = "$ENV{'GSDLCOLLECTDIR'}/import"   if $importdir eq "";
    $archivedir = "$ENV{'GSDLCOLLECTDIR'}/archives" if $archivedir eq "";
    for my $dir ($importdir, $archivedir) {
        # $dir aliases the real variable, so both are normalised in place
        $dir =~ s{[\\/]+}{/}g;
        $dir =~ s{/$}{};
    }

    # load all the plugins
    $pluginfo = plugin::load_plugins ($plugins, $verbosity);
    if (scalar(@$pluginfo) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print STDERR "Warning - removing current contents of the archives directory\n";
        print STDERR " in preparation for the import\n";
        sleep(5); # just in case... gives the user a chance to interrupt
        util::rm_r ($archivedir);
    }

    if (!$debug) {
        # read the archive information file
        $archive_info_filename = util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new($collection, $archive_info, $verbosity, $gzip, $groupsize);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
    } else {
        # in debug mode a docprint processor is used instead and no
        # archive information is loaded or saved
        $processor = docprint->new();
    }

    plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    plugin::end($pluginfo);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }
}
191
192
193
194
195
Note: See TracBrowser for help on using the repository browser.