source: main/trunk/greenstone2/bin/script/g2f-import.pl@ 26324

Last change on this file since 26324 was 26324, checked in by ak19, 12 years ago
  1. When purging documents from the fedora repository during import and buildcol, these documents are also removed from the Fedora GSearch index. Likewise, when documents are ingested into the Fedora repository during buildcol, these documents are also indexed by Fedora GSearch. 2. Added methods in g2futil.pm to be able to do this
  • Property svn:executable set to *
File size: 8.9 KB
Line 
1#!/usr/bin/perl -w
2
BEGIN
{
    # Nothing below can work without a Greenstone installation to load
    # modules from, so bail out straight away when GSDLHOME is missing.
    unless (defined $ENV{'GSDLHOME'}) {
        print STDERR "Environment variable GSDLHOME not set.\n";
        print STDERR " Have you sourced Greenstone's 'setup.bash' file?\n";
        exit 1;
    }

    # Fallback values for the Fedora settings. A value already present in
    # the environment always wins; only undefined variables are filled in.
    my @fedora_defaults = (
        [ 'FEDORA_HOSTNAME',      "localhost"   ],
        [ 'FEDORA_SERVER_PORT',   "8080"        ],
        [ 'FEDORA_USER',          "fedoraAdmin" ],
        [ 'FEDORA_PASS',          "fedoraAdmin" ],
        [ 'FEDORA_PROTOCOL',      "http"        ],
        [ 'FEDORA_PID_NAMESPACE', "greenstone"  ],
        [ 'FEDORA_PREFIX',        "/fedora"     ],
    );

    foreach my $default (@fedora_defaults) {
        my ($env_name, $fallback) = @$default;
        next if (defined $ENV{$env_name});
        $ENV{$env_name} = $fallback;
    }

    # Make Greenstone's own Perl modules findable by the 'use' lines below.
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/");
}
21
22
23use strict;
24no strict 'refs'; # allow filehandles to be variables and vice versa
25no strict 'subs'; # allow barewords (e.g. STDERR) as function arguments
26
27use util;
28use gsprintf 'gsprintf';
29use printusage;
30use parse2;
31
32
33use g2futil;
34
35
# Command-line option specifications, handed to parse2::parse() and to the
# PrintUsage routines. The order of the entries is kept as-is.
my $arguments =
    [
     # --- general ---
     {
         'name'      => "verbosity",
         'desc'      => "{import.verbosity}",
         'type'      => "string",
         'deft'      => '1',
         'reqd'      => "no",
         'hiddengli' => "no",
     },

     # --- Fedora connection settings; each default is read from the
     #     corresponding FEDORA_* environment variable ---
     {
         'name'      => "hostname",
         'desc'      => "Domain hostname of Fedora server",
         'type'      => "string",
         'deft'      => $ENV{'FEDORA_HOSTNAME'},
         'reqd'      => "no",
         'hiddengli' => "no",
     },
     {
         'name'      => "port",
         'desc'      => "Port that the Fedora server is running on.",
         'type'      => "string",
         'deft'      => $ENV{'FEDORA_SERVER_PORT'},
         'reqd'      => "no",
         'hiddengli' => "no",
     },
     {
         'name'      => "username",
         'desc'      => "Fedora admin username",
         'type'      => "string",
         'deft'      => $ENV{'FEDORA_USER'},
         'reqd'      => "no",
         'hiddengli' => "no",
     },
     {
         'name'      => "password",
         'desc'      => "Fedora admin password",
         'type'      => "string",
         'deft'      => $ENV{'FEDORA_PASS'},
         'reqd'      => "no",
         'hiddengli' => "no",
     },
     {
         'name'      => "protocol",
         'desc'      => "Fedora protocol, e.g. 'http' or 'https'",
         'type'      => "string",
         'deft'      => $ENV{'FEDORA_PROTOCOL'},
         'reqd'      => "no",
         'hiddengli' => "no",
     },
     {
         'name'      => "pidnamespace",
         'desc'      => "Fedora prefix for PIDs",
         'type'      => "string",
         'deft'      => $ENV{'FEDORA_PID_NAMESPACE'},
         'reqd'      => "no",
         'hiddengli' => "no",
     },

     # --- options also forwarded to export.pl / import.pl by main() ---
     {
         'name'    => "maxdocs",
         'desc'    => "{import.maxdocs}",
         'type'    => "int",
         'reqd'    => "no",
         'range'   => "1,",
         'modegli' => "1",
     },
     {
         'name'      => "gli",
         'desc'      => "",
         'type'      => "flag",
         'reqd'      => "no",
         'hiddengli' => "yes",
     },
     {
         'name'      => "xml",
         'desc'      => "{scripts.xml}",
         'type'      => "flag",
         'reqd'      => "no",
         'hiddengli' => "yes",
     },
     {
         'name'    => "removeold",
         'desc'    => "{import.removeold}",
         'type'    => "flag",
         'reqd'    => "no",
         'modegli' => "3",
     },
     {
         'name'    => "language",
         'desc'    => "{scripts.language}",
         'type'    => "string",
         'reqd'    => "no",
         'modegli' => "3",
     },
     {
         'name'      => "collectdir",
         'desc'      => "{import.collectdir}",
         'type'      => "string",
         'deft'      => "",
         'reqd'      => "no",
         'hiddengli' => "yes",
     },
    ];
115
# Program description passed to the PrintUsage routines together with the
# option table above.
# NOTE(review): 'name' reads "g2fimport.pl" although this file is
# g2f-import.pl - confirm whether anything depends on the unhyphenated name
# before changing it.
my $prog_options = {
    'name' => "g2fimport.pl",
    'desc' => "Generate Fedora METS format using Greenstone document processing capabilities",
    'args' => $arguments,
};
120
121
# Convert the name of a (URL,hashID) metadata file found in the export
# directory into the Fedora PID it is purged under, e.g.
#   greenstone-http=tmpcol-http-__test1-html.xml
#     -> greenstone-http:tmpcol-http-__test1.html
sub _metadata_file_to_pid
{
    my ($file) = @_;

    my $fedora_pid = $file;
    $fedora_pid =~ s/\.xml$//;
    $fedora_pid =~ s/\=/:/;             # first '=' becomes the ':' after the namespace
    $fedora_pid =~ s/(.*)-(.*)$/$1.$2/; # last '-' becomes a '.'

    return $fedora_pid;
}

# Purge from Fedora (and, when a GSearch webapp is found, from its index)
# the objects that belong to a previous export of this collection.
#
# Parameters:
#   $export_dir    - the collection's existing "export" directory
#   $gs_col        - Greenstone collection name
#   $pid_namespace - Fedora PID namespace used to build each PID
#   $maxdocs       - passed through to g2futil::get_all_hash_dirs (may be undef)
#   $options       - parsed command-line options (hash ref)
#
# Exits with status 1 if $export_dir cannot be opened.
sub _purge_existing_objects
{
    my ($export_dir, $gs_col, $pid_namespace, $maxdocs, $options) = @_;

    print "***\n";
    print "* Updating existing Greenstone $gs_col objects from Fedora $pid_namespace\n";
    print "***\n";

    # set up fedoragsearch for updating the index upon ingesting documents
    my $fedoragsearch_webapp = &g2futil::gsearch_webapp_folder();

    # need the username and password preset in order to run fedoraGSearch's RESTClient script
    # this assumes that the fedoragsearch authentication details are the same as for fedora
    if (defined $fedoragsearch_webapp) {
        $ENV{'fgsUserName'} = $options->{'username'};
        $ENV{'fgsPassword'} = $options->{'password'};
    }

    # A lexical handle avoids clobbering any global DIR handle. Failing to
    # read the export directory is an error, so report it with a non-zero
    # exit status.
    my $dir_handle;
    if (!opendir($dir_handle, $export_dir)) {
        print STDERR "Error: Unable to open directory $export_dir: $!\n";
        exit 1;
    }
    my @xml_files = grep { $_ =~ m/^greenstone-http.*\.xml$/ } readdir($dir_handle);
    closedir($dir_handle);

    # purge all the (URL,hashID) metadata files that we inserted
    # into fedora at the end of g2f-buildcol.pl
    foreach my $file (@xml_files) {
        my $fedora_pid = &_metadata_file_to_pid($file);

        print STDERR "#### fedora_pid: $fedora_pid\n";
        &g2futil::run_purge($fedora_pid,$options); # displays error message if first time (nothing to purge)
    }

    my @hash_dirs = &g2futil::get_all_hash_dirs($export_dir,$maxdocs);

    # for each hash dir, purge its respective PID
    foreach my $full_hd (@hash_dirs) {

        my $hash_id = &g2futil::get_hash_id($full_hd);
        next if !defined $hash_id;

        my $pid = "$pid_namespace:$gs_col-$hash_id";

        my $dsinfo_status = &g2futil::run_datastore_info($pid,$options);

        if ($dsinfo_status == 0) {
            # first remove the doc from the gsearch index before removing it
            # from the fedora repository; only announce the deletion when it
            # is actually going to be attempted
            if (defined $fedoragsearch_webapp) {
                print " deleting $pid from GSearch index\n";
                &g2futil::run_delete_from_index($fedoragsearch_webapp,$pid,$options);
            }

            print " $pid being updated.\n";
            &g2futil::run_purge($pid,$options);
        }
        else {
            print " $pid not present.\n";
        }
    }
}

# Entry point: export a Greenstone collection as FedoraMETS.
#
# Steps, in order:
#   1. parse the command line (exactly one non-option argument, the
#      collection name, must remain);
#   2. work out the collect directory and, unless Fedora runs inside
#      Greenstone 3's own tomcat, write the gsdl.xml file for Fedora;
#   3. if the collection already has an export directory, purge its
#      previously ingested objects from Fedora;
#   4. run export.pl (-saveas FedoraMETS) and then import.pl.
#
# Parameters: the command-line arguments (normally @ARGV).
# Dies after printing usage when option parsing fails; returns early after
# printing usage when the collection name is missing; exits with status 1
# when the collection or its export directory cannot be accessed.
sub main
{
    my (@ARGV) = @_;

    my $GSDLHOME = $ENV{'GSDLHOME'};

    my $options = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$options,"allow_extra_options");

    # Something went wrong with parsing
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        die "\n";
    }

    my $xml = $options->{'xml'};
    my $gli = $options->{'gli'};

    if ($intArgLeftinAfterParsing != 1)
    {
        if ($xml) {
            &PrintUsage::print_xml_usage($prog_options);
        }
        else {
            &PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        }
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    my $gs_col = $ARGV[0];

    my $verbosity     = $options->{'verbosity'};
    my $hostname      = $options->{'hostname'};
    my $port          = $options->{'port'};
    my $username      = $options->{'username'};
    my $password      = $options->{'password'};
    my $protocol      = $options->{'protocol'};
    my $pid_namespace = $options->{'pidnamespace'};

    # The following are needed in the FedoraMETS plugout
    $ENV{'FEDORA_HOSTNAME'}    = $hostname;
    $ENV{'FEDORA_SERVER_PORT'} = $port;

    my $language   = $options->{'language'};
    my $collectdir = $options->{'collectdir'};
    my $removeold  = $options->{'removeold'};
    my $maxdocs    = $options->{'maxdocs'};

    if (!$collectdir) {
        # Explicitly set one, depending on whether it's GS2 or GS3
        if ($ENV{'GSDL3HOME'}) {
            $collectdir = &util::filename_cat($ENV{'GSDL3HOME'},"sites","localsite","collect");
        } else {
            $collectdir = &util::filename_cat($ENV{'GSDLHOME'},"collect");
        }
    }

    # if GS3, and if Fedora uses Greenstone 3's tomcat, then we do not need to
    # write out the file gsdl.xml into Fedora's tomcat.
    # The path is only built when GSDL3SRCHOME is set, so that an undefined
    # value is never handed to filename_cat on a Greenstone 2 installation.
    my $fedora_in_gs3_tomcat = 0;
    if ($ENV{'GSDL3SRCHOME'}) {
        my $localfedora = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "fedora.xml");
        $fedora_in_gs3_tomcat = 1 if (-e $localfedora);
    }
    if (!$fedora_in_gs3_tomcat) {
        # method that will tell Fedora where the ultimate output of g2f-import and g2f-build
        # will be found: Points Fedora to the collect directory where FedoraMETS will be output.
        &g2futil::write_gsdl_xml_file($hostname, $collectdir, $options);
    }

    my $full_gs_col = &util::filename_cat($collectdir,$gs_col);

    if (!-e $full_gs_col ) {
        print STDERR "Unable to find Greenstone collection $full_gs_col\n";
        exit 1;
    }

    my $export_dir = &util::filename_cat($full_gs_col,"export");

    if ( -e $export_dir ) {
        &_purge_existing_objects($export_dir, $gs_col, $pid_namespace, $maxdocs, $options);
    }

    print "***\n";
    print "* Processing $gs_col into FedoraMETS format\n";
    print "***\n";

    my $gs_export_opts = "-saveas FedoraMETS -fedora_namespace $pid_namespace";

    # options shared by the export.pl and import.pl invocations
    my $gs_opts = " -verbosity $verbosity";
    $gs_opts .= " -gli" if ($gli);

    $gs_opts .= " -language $language" if ($language);
    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
    $gs_opts .= " -removeold" if ($removeold);
    $gs_opts .= " -maxdocs $maxdocs" if ($maxdocs);

    $gs_export_opts .= " $gs_opts -exportdir \"$export_dir\"";

    my $gs_export_arguments = "$gs_export_opts $gs_col";

    &g2futil::run_cmd("export.pl", $gs_export_arguments, $options);

    print STDERR "**** Just for now, also run Greenstone's import.pl\n";
    # if we have the FedoraMETSPlugIN then we wouldn't have to run import anymore
    my $gs_import_arguments = "$gs_opts $gs_col";

    &g2futil::run_cmd("import.pl", $gs_import_arguments, $options);
}
299
# Script entry point: hand the command-line arguments to main().
main(@ARGV);
301
302
303
Note: See TracBrowser for help on using the repository browser.