source: gsdl/trunk/bin/script/explode_metadata_database.pl@ 17037

Last change on this file since 17037 was 17037, checked in by kjdon, 16 years ago

made descriptions for new options come from dictionary, hide the collection option in gli

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
Line 
1#!/usr/bin/perl -w
2
3
# Runs at compile time, before the "use" lines below are processed, so that
# modules under $GSDLHOME/perllib can be found. Aborts if GSDLHOME is not set.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    # A single unshift that yields the same search order as unshifting
    # perllib first and perllib/plugins second: plugins ends up in front.
    unshift(@INC,
            "$ENV{'GSDLHOME'}/perllib/plugins",
            "$ENV{'GSDLHOME'}/perllib");
}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17use colcfg;
18
19use FileHandle;
20
21use File::Spec;
22use File::Basename;
23
# Choices offered for the -input_encoding option. The first four entries are
# fixed; their descriptions are dictionary keys.
my $unicode_list = [
    { 'name' => "auto",    'desc' => "{BasPlug.input_encoding.auto}" },
    { 'name' => "ascii",   'desc' => "{BasPlug.input_encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasPlug.input_encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasPlug.input_encoding.unicode}" },
];

# Append one entry for every encoding listed in the encodings module, ordered
# by the encoding's 'name' value; that value is also used as the description.
my $e = $encodings::encodings;
my @encodings_by_name =
    sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} } keys(%$e);
push(@{$unicode_list},
     map { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } } @encodings_by_name);
43
# Command-line option table handed to parse2::parse and to the usage printers.
# Each entry describes one option; 'desc' values are dictionary keys.
my $arguments = [
    {
        'name'      => 'language',
        'desc'      => '{scripts.language}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'plugin',
        'desc'      => '{explode.plugin}',
        'type'      => 'string',
        'reqd'      => 'yes',
        'hiddengli' => 'yes',
    },
    {
        'name' => 'input_encoding',
        'desc' => '{explode.encoding}',
        'type' => 'enum',
        'deft' => 'auto',
        'list' => $unicode_list,
        'reqd' => 'no',
    },
    {
        'name' => 'metadata_set',
        'desc' => '{explode.metadata_set}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'document_field',
        'desc' => '{explode.document_field}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'document_prefix',
        'desc' => '{explode.document_prefix}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'document_suffix',
        'desc' => '{explode.document_suffix}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name'  => 'records_per_folder',
        'desc'  => '{explode.records_per_folder}',
        'type'  => 'int',
        'range' => '0,',
        'deft'  => '100',
        'reqd'  => 'no',
    },
    {
        'name'    => 'plugin_options',
        'desc'    => '{explode.plugin_options}',
        'type'    => 'string',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'      => 'collection',
        'desc'      => '{explode.collection}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'    => 'verbosity',
        'desc'    => '{import.verbosity}',
        'type'    => 'int',
        'range'   => '0,',
        'deft'    => '1',
        'reqd'    => 'no',
        'modegli' => '4',
    },
    {
        'name'      => 'xml',
        'desc'      => '',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];

# Top-level description of this script, wrapping the option table above.
my $options = {
    'name' => 'explode_metadata_database.pl',
    'desc' => '{explode.desc}',
    'args' => $arguments,
};
111
112
113
# Entry point. Parses the command line, loads the requested plugin, and
# "explodes" the metadata database file named on the command line:
#   - if the plugin object has a 'split_exp', the file text is split into
#     records and each record is processed by the plugin and written out,
#     grouped into folders of $records_per_folder records;
#   - otherwise the plugin's metadata_read/extra_metadata are used and a
#     single directory is written.
# On success the original database file is removed. All fatal conditions end
# in die.
sub main
{
    # NOTE: these variable names must stay identical to the option names in
    # $arguments: the parsed values are assigned to them by name in the
    # string eval below.
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix, $records_per_folder, $plugin_options, $collection, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy every parsed option into the lexical variable of the same name
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }


    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-empty set name becomes a "name." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }
    if (defined $collection && $collection =~ /\w/) {
        # Keep the name the user asked for: use_collection's return value
        # replaces $collection and is empty on failure, which would otherwise
        # leave the error message without a collection name.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection("", $requested_collection, "");
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    }

    my $plugobj;
    require "$plugin.pm";

    if (defined $plugin_options && $plugin_options =~ /\w/) {
        # Turn the whitespace-separated option string into a quoted,
        # comma-separated list for the constructor call below
        my @options = split(/\s/, $plugin_options);
        foreach my $option (@options) {
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options = join (",", @options);
        # NOTE(review): the option text is evaluated as Perl source here, so
        # '$' and '@' inside option values are interpolated.
        eval ("\$plugobj = new $plugin([], [$plugin_options])");
        die "$@" if $@;
    } else {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    }
    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # Create a directory to store the document files...
    # Its name is the database filename with the final extension removed.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq "") {
        # Without this check the script would carry on with no usable
        # directory name and still delete the original file at the end.
        print STDERR "Cannot derive an output directory name from $filename (no file extension)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);


    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp

        my @metadata_records = split(/$split_exp/, $text);
        print STDERR "Number of records: " . scalar(@metadata_records) . "\n";

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir,$record_number,$records_per_folder,
                                     \@metadata_records,\$documents_directory);
            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc");
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);


            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);


            check_close_directory($record_number,$records_per_folder,\@metadata_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $metadata = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = {};
        my $extrametadata = {};


        $plugobj->metadata_read($pluginfo, "", $filename, $metadata,
                                $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);


        my $documents_directory = need_new_directory($exploded_base_dir);

        # Attach metadata to object
        # => use the plugin's extra_metadata function to avoid duplicating code
        my $doc_obj = doc->new($filename, "nonindexed_doc");

        $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # Try to get a doc to attach the metadata to
        # If no match found, create a dummy .nul file
        attach_metadata_or_make_nul_doc($document_field, $doc_obj, undef,
                                        $documents_directory, $orig_base_dir,
                                        $document_prefix, $document_suffix, $metadata_set, $verbosity);


        close_directory();
    }


    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();

}
298
299
# Creates a new documents directory and starts its metadata.xml file.
#
#   $exploded_base_dir - path of the directory to create
#
# Dies if the directory or its metadata.xml already exists, or if metadata.xml
# cannot be opened for writing. On success the global METADATA_XML_FILE handle
# is left open, with the XML header and the opening <DirectoryMetadata> tag
# already written. Returns the directory path.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file. The open must be checked: the caller goes
    # on to delete the original database file, so a silent failure here would
    # lose the metadata altogether.
    open(METADATA_XML_FILE, ">", $documents_metadata_xml_file)
        or die "Error: could not create metadata.xml file $documents_metadata_xml_file: $!\n";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
325
# Starts a new documents directory when $record_number is the first record of
# a folder, i.e. records 1, N+1, 2N+1, ... for N = $records_per_folder.
#
#   $exploded_base_dir  - base name for the directory/directories
#   $record_number      - 1-based number of the record about to be written
#   $records_per_folder - maximum number of records per directory
#   $metadata_records   - reference to the array of all records
#   $documents_dir_ref  - reference to a scalar that receives the path of a
#                         newly started directory (untouched otherwise)
#
# When there are more records than fit in one folder, the directory name gets
# a ".NNNNNNNN" suffix holding the zero-padded number of its first record.
#
# NOTE(review): a $records_per_folder of 0 makes the modulus below fatal
# ("Illegal modulus zero"); the intended meaning of 0 is not visible here.
sub check_need_new_directory
{
    my ($exploded_base_dir,$record_number, $records_per_folder,$metadata_records,
        $documents_dir_ref) = @_;

    # Testing ($record_number % N) == 1 can never succeed when N is 1, so no
    # directory would ever be started; this form is equivalent for N >= 2 and
    # also correct for N == 1.
    my $starts_new_folder = ((($record_number - 1) % $records_per_folder) == 0);
    return unless $starts_new_folder;

    my $documents_directory = $exploded_base_dir;

    if (scalar(@$metadata_records) > $records_per_folder) {
        $documents_directory .= "." . sprintf("%8.8d", $record_number);
    }

    $$documents_dir_ref = need_new_directory($documents_directory);
}
343
344
345
346
347
# Writes the metadata.xml entry (or entries) for one record.
#
# For every metadata value of the record whose field equals $document_field,
# the named document is obtained via obtain_document and an entry is written
# for it. If no document was obtained, an empty placeholder ".nul" file is
# created in $documents_directory and the entry is written for that instead.
#
#   $document_field      - metadata field naming the document (may be undef)
#   $doc_obj             - object providing get_top_section/get_all_metadata
#   $record_number       - record number used to name the .nul file, or undef
#                          (the placeholder is then called "doc.nul")
#   $documents_directory - directory receiving documents and .nul files
#   $orig_base_dir       - passed through to obtain_document
#   $document_prefix,
#   $document_suffix     - text put before/after the field value to form the
#                          document location (undef is treated as empty)
#   $metadata_set        - prefix for metadata names in metadata.xml
#   $verbosity           - passed through to obtain_document
#
# Entries are written to the global METADATA_XML_FILE handle.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both options are optional; avoid concatenating undefined values
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            # Collapse doubled backslashes in the value
            $value =~ s/\\\\/\\/g;

            my $document_file_full = $document_prefix . $value . $document_suffix;

            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained any documents for this record
    if (not defined $document_file) {

        if (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        my $dummy_path = "$documents_directory/$document_file";
        if (open(my $dummy_fh, ">", $dummy_path)) {
            close($dummy_fh);
        }
        else {
            # Report it rather than silently leaving metadata.xml pointing at
            # a file that does not exist
            print STDERR "WARNING: Could not create dummy file $dummy_path: $!\n";
        }
        &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
389
# Finishes the current metadata.xml file: writes the closing
# </DirectoryMetadata> tag to the global METADATA_XML_FILE handle and closes
# it. Dies if either step fails; buffered write errors only show up at close,
# and the caller deletes the original database file afterwards.
sub close_directory
{
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"
        or die "Error: could not finish metadata.xml file: $!\n";
    close(METADATA_XML_FILE)
        or die "Error: could not close metadata.xml file: $!\n";
}
397
398
# Closes the current metadata.xml file (via close_directory) when the record
# just written completes a folder or is the last record of all.
#
#   $record_number      - 1-based number of the record just written
#   $records_per_folder - number of records per folder
#   $metadata_records   - reference to the array of all records
sub check_close_directory
{
    my ($record_number, $records_per_folder, $metadata_records) = @_;

    if (($record_number % $records_per_folder) != 0) {
        # Folder not yet full: carry on unless this was the final record
        return if $record_number != scalar(@$metadata_records);
    }

    # Finish and close the metadata.xml file
    close_directory();
}
408
409
410
# Writes one <FileSet> element to a metadata.xml file.
#
#   $metadata_xml_file - filehandle to print to
#   $file_name         - name of the file the metadata belongs to
#   $record_metadata   - reference to an array of [field, value] pairs
#   $meta_prefix       - text put in front of every metadata name
#
# The file name is XML-escaped and then has . ( ) { } [ ] backslash-escaped so
# that it can serve as a regular expression matching the name. Values are
# XML-escaped (&, <, >); field names are written as they are.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    for ($file_name) {
        # Make the name XML-safe ('&' first, so the entities added for '<'
        # and '>' are not escaped again)
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;

        # Convert the name into a regular expression that matches it
        s/([.(){}\[\]])/\\$1/g;
    }

    # We're only interested in metadata from the database, so these fields
    # are left out
    my %is_skipped_field = map { $_ => 1 }
        qw(lastmodified gsdlsourcefilename gsdldoctype FileFormat);

    my $entry =
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        next if $is_skipped_field{$field};

        # Ignore the ^all metadata, since it will be invalid if the source
        # metadata is changed (ISISPlug specific!)
        next if ($field =~ /\^all$/);

        # Make the value XML-safe. Escaping every '&' may mess up entities
        # already present in the value; undefined entities are not checked.
        for ($value) {
            s/&/&amp;/g;
            s/</&lt;/g;
            s/>/&gt;/g;
        }

        $entry .= " <Metadata mode=\"accumulate\" name=\"$meta_prefix$field\">$value</Metadata>\n";
    }

    $entry .=
        " </Description>\n" .
        " </FileSet>\n";

    print $metadata_xml_file $entry;
}
468
# Fetches or copies one document into $documents_directory.
#
#   $document_file_full  - http(s)/ftp URL, or a file path (a relative path is
#                          taken relative to $orig_base_dir)
#   $documents_directory - directory to place the document in
#   $orig_base_dir       - directory of the metadata database file
#   $verbosity           - progress is reported above 1; wget is verbose above 2
#
# URLs are downloaded with wget. Files on disk are copied, and the original is
# then removed if its path starts with $orig_base_dir. If the document is not
# present afterwards, an empty "<name>.nul" placeholder is created instead.
#
# Returns the document's file name within $documents_directory (with ".nul"
# appended when the placeholder was created).
sub obtain_document
{
    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        # The file name is whatever follows the last '/'
        $document_file_full =~ /([^\/]+)$/;
        $document_file_name = $1;
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_options = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_options, "--timestamping"); # Only re-download files if they're newer

        # List-form system() hands the arguments to wget directly, so
        # characters in the URL or path cannot be interpreted by a shell.
        system("wget", @wget_options, $document_file_full,
               "--output-document", $local_document_file);

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        my $dir_sep = &util::get_os_dirsep();

        # Everything after the last directory separator is the file name
        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;


        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir,$document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        &util::cp($document_file_full, $documents_directory);

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            # \Q...\E makes the directory a literal prefix. Unquoted, a base
            # directory of "." matches every path (so unrelated originals
            # were deleted), and backslashes in a path act as regex escapes.
            if ($document_file_full =~ m/^\Q$orig_base_dir\E/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        if (open(my $null_fh, ">", "$local_document_file.nul")) {
            close($null_fh);
        }
        else {
            print STDERR "WARNING: Could not create placeholder file $local_document_file.nul: $!\n";
        }
    }

    return $document_file_name;
}
537
# Run the script, handing main the command-line arguments.
main(@ARGV);
539
Note: See TracBrowser for help on using the repository browser.