source: main/trunk/greenstone2/bin/script/explode_metadata_database.pl@ 36777

Last change on this file since 36777 was 36777, checked in by davidb, 14 months ago

Changes made to align with the needs of the Eurovision SPARQL prepare 02 script

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.3 KB
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
9}
10
11use strict;
12no strict 'subs'; # allow barewords (eg STDERR) as function arguments
13no strict 'refs'; # allow filehandles to be variables and vice versa
14
15use encodings;
16use extrametautil;
17use util;
18use FileUtils;
19use printusage;
20use parse2;
21use colcfg;
22
23use FileHandle;
24
25use File::Spec;
26use File::Basename;
27
# Values accepted by -input_encoding.  The four generic choices come first;
# every encoding known to the encodings module is appended after them,
# ordered by its human-readable name.
my $unicode_list = [
    { 'name' => "auto",    'desc' => "{ReadTextFile.input_encoding.auto}" },
    { 'name' => "ascii",   'desc' => "{BasePlugin.encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasePlugin.encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasePlugin.encoding.unicode}" },
];

my $e = $encodings::encodings;
push @{$unicode_list},
    map  { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } }
    sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} }
    keys %{$e};
47
# Command-line option table handed to parse2::parse and to the usage printers.
# One hash per option; 'desc' values in braces are resource-bundle keys.
my $arguments = [
    { name => 'language',        desc => '{scripts.language}',        type => 'string', reqd => 'no',  hiddengli => 'yes' },
    { name => 'plugin',          desc => '{explode.plugin}',          type => 'string', reqd => 'yes', hiddengli => 'yes' },
    { name => 'input_encoding',  desc => '{explode.encoding}',        type => 'enum',   reqd => 'no',
      deft => 'auto',            list => $unicode_list },
    { name => 'metadata_set',    desc => '{explode.metadata_set}',    type => 'string', reqd => 'no' },
    { name => 'document_field',  desc => '{explode.document_field}',  type => 'string', reqd => 'no' },
    { name => 'document_prefix', desc => '{explode.document_prefix}', type => 'string', reqd => 'no' },
    { name => 'document_suffix', desc => '{explode.document_suffix}', type => 'string', reqd => 'no' },
    { name => 'records_per_folder', desc => '{explode.records_per_folder}', type => 'int', reqd => 'no',
      range => '0,',             deft => '100' },
    # parsearg left "" as default for collectdir, rather than
    # &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect")
    { name => 'collectdir',      desc => '{import.collectdir}',       type => 'string', reqd => 'no',  hiddengli => 'yes',
      deft => '' },
    { name => 'site',            desc => '{import.site}',             type => 'string', reqd => 'no',  hiddengli => 'yes',
      deft => '' },
    { name => 'collection',      desc => '{explode.collection}',      type => 'string', reqd => 'no',  hiddengli => 'yes' },
    { name => 'use_collection_plugin_options', desc => '{explode.use_collection_plugin_options}',
      type => 'flag',            reqd => 'no',                        hiddengli => 'yes' },
    { name => 'plugin_options',  desc => '{explode.plugin_options}',  type => 'string', reqd => 'no',  hiddengli => 'yes' },
    { name => 'verbosity',       desc => '{import.verbosity}',        type => 'int',    reqd => 'no',  modegli => '3',
      range => '0,',             deft => '1' },
    { name => 'xml',             desc => '',                          type => 'flag',   reqd => 'no',  hiddengli => 'yes' },
];

# Script descriptor used when printing text or XML usage.
my $options = {
    name => 'explode_metadata_database.pl',
    desc => '{explode.desc}',
    args => $arguments,
};
134
135
136
# Entry point.  Explodes one metadata database file into one or more
# directories, each holding a metadata.xml file plus one document (or .nul
# placeholder) per record, and then deletes the original database file.
#
# Steps:
#   1. parse and validate the command-line options (see $arguments);
#   2. load and construct the requested plugin, optionally with options taken
#      from the collection configuration or from -plugin_options;
#   3. obtain the records, either by splitting the file text (plugins that
#      define 'split_exp') or through the plugin's metadata_read;
#   4. write every record out via attach_metadata_or_make_nul_doc, starting
#      and finishing directories with check_need_new_directory and
#      check_close_directory;
#   5. remove the original file and let the plugin clean up.
#
# Dies (after printing usage or an error to STDERR) on invalid arguments.
sub main
{
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy each parsed option into the local variable of the same name.
    # An explicit table of references replaces building and eval-ing Perl
    # source text per option; keys without a matching variable are ignored.
    my %variable_for_option = (
        'language'                      => \$language,
        'input_encoding'                => \$input_encoding,
        'metadata_set'                  => \$metadata_set,
        'plugin'                        => \$plugin,
        'document_field'                => \$document_field,
        'document_prefix'               => \$document_prefix,
        'document_suffix'               => \$document_suffix,
        'records_per_folder'            => \$records_per_folder,
        'plugin_options'                => \$plugin_options,
        'collectdir'                    => \$collectdir,
        'site'                          => \$site,
        'collection'                    => \$collection,
        'use_collection_plugin_options' => \$use_collection_plugin_options,
        'verbosity'                     => \$verbosity,
        'xml'                           => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $variable_ref = $variable_for_option{$strVariable};
        next unless defined $variable_ref;
        $$variable_ref = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-empty set name becomes the "<set>." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }

    if (defined $collection && $collection =~ /\w/) {
        # Remember what the user asked for: use_collection's return value
        # replaces $collection and is "" on failure, so the error message
        # must not be built from it.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    } else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }

    # Make collection-specific plugins loadable.  Only meaningful when a
    # collection was given; without one the path would be built from undef.
    if (defined $collectdir && defined $collection) {
        unshift (@INC, "$collectdir/$collection/perllib/plugins");
    }

    my $plugobj;
    require "$plugin.pm";

    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file.
        my $gs_mode = "gs2";
        if ((defined $site) && ($site ne "")) { # GS3
            $gs_mode = "gs3";
        }
        my $configfilename = &colcfg::get_collect_cfg_name(STDERR, $gs_mode);
        my $collectcfg = &colcfg::read_collect_cfg ($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        # Whitespace-separated options; quote each one unless the user
        # already started it with a double quote.
        my @options = split(/\s/, $plugin_options);
        map { $_ = "\"$_\"" unless $_ =~ /^\"/; } @options;
        $plugin_options_string= join (",", @options);
    }

    # NOTE(review): the plugin name and option string are compiled as Perl
    # source here, so they must come from a trusted command line/config.
    if ($plugin_options_string eq "") {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    } else {
        eval ("\$plugobj = new $plugin([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # The directory that will store the document files is the input file name
    # minus its final extension.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq "") {
        print STDERR "Error: cannot derive an output directory name from $filename (expected a name of the form <base>.<extension>)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {

        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's own splitter
        my $metadata_records_ref = $plugobj->split_text_into_segments(\$text);
        my @metadata_records = @$metadata_records_ref;

        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir,$record_number,
                                     $records_per_folder,$total_num_records,
                                     \$documents_directory);
            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            check_close_directory($record_number,$records_per_folder,$total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar (@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";
        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            &check_need_new_directory($exploded_base_dir, $record_number, $records_per_folder, $total_num_records, \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), &extrametautil::getmetadata($extrametadata, $record));

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number, $documents_directory, $orig_base_dir, $document_prefix, $document_suffix, $metadata_set, $verbosity);

            &check_close_directory($record_number,$records_per_folder,$total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &FileUtils::removeFiles($filename);
    $plugobj->clean_up_after_exploding();

}
367
368
# Creates the directory $exploded_base_dir and starts a metadata.xml file in
# it, leaving the global METADATA_XML_FILE handle open for the record entries
# that follow (close_directory finishes and closes it).
#
# Returns the directory path.
# Dies if the directory or its metadata.xml already exists, or if either
# cannot be created.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &FileUtils::makeDirectory($documents_directory);
    # Verify the result directly rather than relying on the helper's return value.
    if (!-d $documents_directory) {
        die "Error: could not create document directory $documents_directory (bailing).\n";
    }

    my $documents_metadata_xml_file = &FileUtils::filenameConcatenate($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file.  Three-argument open keeps characters in
    # the path from being read as an open mode, and a failure stops the run
    # instead of silently discarding every later print to the handle.
    open(METADATA_XML_FILE, ">", $documents_metadata_xml_file)
        or die "Error: could not create $documents_metadata_xml_file: $!\n";
    binmode METADATA_XML_FILE, ":utf8";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
395
# Starts a new output directory when $record_number is the first record of a
# folder, storing the new directory path through $documents_dir_ref.
#
#   $exploded_base_dir  - base path for the output directories
#   $record_number      - 1-based number of the record about to be written
#   $records_per_folder - maximum records per directory; 0 (or undef) is
#                         treated as "no limit", i.e. a single directory
#   $total_num_records  - total number of records being exploded
#   $documents_dir_ref  - scalar ref updated with the directory in use
#
# When the records do not all fit in one folder, each directory name gets an
# 8-digit suffix holding the number of its first record.
sub check_need_new_directory
{
    my ($exploded_base_dir,$record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    # The option's range allows 0, which must never reach the modulus below
    # ("Illegal modulus zero").
    my $no_limit = (!defined $records_per_folder || $records_per_folder <= 0);

    my $starts_new_folder;
    if ($no_limit) {
        $starts_new_folder = ($record_number == 1);
    }
    else {
        $starts_new_folder = ($records_per_folder == 1
                              || ($record_number % $records_per_folder) == 1);
    }
    return unless $starts_new_folder;

    my $documents_directory = $exploded_base_dir;

    if (!$no_limit && $total_num_records > $records_per_folder) {
        $documents_directory .= "." . sprintf("%8.8d", $record_number);
    }

    $$documents_dir_ref = need_new_directory($documents_directory);
    return;
}
413
414
415
416
417
# Writes one record's metadata into the currently open metadata.xml
# (global handle METADATA_XML_FILE), attached either to a real document or to
# a placeholder .nul file.
#
#   $document_field      - metadata field naming the record's document
#                          (may be undef: no document lookup is attempted)
#   $doc_obj             - doc object carrying the record's metadata
#   $record_number       - 1-based record number, used to name the .nul file
#                          when the doc object has no OID
#   $documents_directory - directory the document / .nul file is placed in
#   $orig_base_dir       - directory of the original database file
#   $document_prefix     - text (or directory) put before the field value
#   $document_suffix     - text put after the field value
#   $metadata_set        - metadata set prefix passed on to the writer
#   $verbosity           - verbosity passed on to obtain_document
#
# Every metadata pair whose field equals $document_field produces one
# obtain_document call and one metadata.xml entry.  If none matches, an empty
# "<name>.nul" file is created and the metadata is attached to that.
# Dies if the .nul file cannot be created.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both are optional on the command line; treat "not given" as empty so
    # the -d test and the concatenations below never operate on undef.
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g; # don't regex brackets () here though!

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            my $document_file_full;
            if (-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@ ) {
                # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                $document_prefix =~ s/(\/|\\)$//;
                $document_file_full = &FileUtils::filenameConcatenate($document_prefix, "$value$document_suffix");
            } else { # the doc prefix may also contain the prefix of the actual *filename* following the directory
                $document_file_full = $document_prefix . $value . $document_suffix;
            }

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        my $doc_oid = $doc_obj->get_OID();

        if (defined ($doc_oid)) {
            $document_file = "$doc_oid.nul";
        }
        elsif (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        # An entry pointing at a file that was never created would be
        # useless, so failure to create the placeholder is fatal.
        my $dummy_file_path = "$documents_directory/$document_file";
        open(DUMMY_FILE, ">", $dummy_file_path)
            or die "Error: could not create $dummy_file_path: $!\n";
        close(DUMMY_FILE);
        &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
473
# Writes the closing </DirectoryMetadata> tag and closes the global
# METADATA_XML_FILE handle opened by need_new_directory.
#
# Dies if the close fails: buffered write errors only surface at close, and
# the original database file is deleted once exploding finishes, so an
# incomplete metadata.xml must not go unnoticed.
sub close_directory
{
    # Finish and close the metadata.xml file
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
    close(METADATA_XML_FILE)
        or die "Error: could not finish writing the metadata.xml file: $!\n";

}
481
482
# Finishes the current directory's metadata.xml when $record_number is the
# last record of a folder or the last record overall.
#
#   $record_number      - 1-based number of the record just written
#   $records_per_folder - maximum records per directory; 0 (or undef) is
#                         treated as "no limit", so only the final record
#                         closes the file
#   $total_num_records  - total number of records being exploded
sub check_close_directory
{
    my ($record_number,$records_per_folder,$total_num_records) = @_;

    # Guard the modulus: the option's range allows 0, and "% 0" is fatal.
    my $folder_is_full = (defined $records_per_folder
                          && $records_per_folder > 0
                          && ($record_number % $records_per_folder) == 0);

    if ($folder_is_full || $record_number == $total_num_records) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
492
493
494
# Appends one <FileSet> element to an open metadata.xml handle.
#
#   $metadata_xml_file - filehandle to print to
#   $file_name         - name of the file the metadata belongs to; written
#                        XML-escaped and with . ( ) { } [ ] backslash-escaped
#                        so that it reads as a regular expression
#   $record_metadata   - array ref of [field, value] pairs
#   $meta_prefix       - "" to keep field names, otherwise a prefix that
#                        replaces any leading "word." part of each field
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    for ($file_name) {
        # Make the name XML-safe (ampersand first)...
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        # ...then turn it into a regular expression that matches it
        s/([.(){}\[\]])/\\$1/g;
    }

    print $metadata_xml_file join("",
        "\n",
        " <FileSet>\n",
        " <FileName>$file_name</FileName>\n",
        " <Description>\n");

    # Fields that do not come from the database itself
    my %is_internal_field = map { $_ => 1 }
        ("lastmodified", "gsdlsourcefilename", "gsdldoctype", "FileFormat");

    foreach my $entry (@$record_metadata) {
        my ($field, $value) = (@$entry);

        # We're only interested in metadata from the database
        next if $is_internal_field{$field};

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        # remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
        $field =~ s/^ex\.([^.]+)$/$1/;

        for ($value) {
            # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
            s/\[/&\#091;/g;
            s/\]/&\#093;/g;
            # Make the value XML-safe; this also escapes the ampersands of
            # entities already present in the value
            s/&/&amp;/g;
            s/</&lt;/g;
            s/>/&gt;/g;
        }

        # Swap any namespace already on the field for the requested one
        my $qualified_field = $field;
        unless ($meta_prefix eq "") {
            $qualified_field =~ s/^\w+\.//;
            $qualified_field = $meta_prefix . $qualified_field;
        }

        print $metadata_xml_file " <Metadata mode=\"accumulate\" name=\"$qualified_field\">$value</Metadata>\n";
    }

    print $metadata_xml_file join("",
        " </Description>\n",
        " </FileSet>\n");
}
565
# Fetches the document belonging to a record into $documents_directory.
#
#   $document_file_full  - http/https/ftp URL, or a path on disk (a relative
#                          path is taken relative to $orig_base_dir)
#   $documents_directory - directory to place the document in
#   $orig_base_dir       - directory of the original database file
#   $verbosity           - >1 prints progress, >2 makes wget verbose
#
# URLs are downloaded with wget.  Files on disk are copied, and the original
# is then removed if its path starts with $orig_base_dir.  If no document
# could be obtained, an empty "<name>.nul" file is created instead.
#
# Returns the name of the file (without directory) now present in
# $documents_directory.  Dies if the .nul file cannot be created.
sub obtain_document
{
    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        # Local name = last path component of the URL.  A URL ending in "/"
        # has none; fall back to a fixed name rather than using undef.
        ($document_file_name) = ($document_file_full =~ /([^\/]+)$/);
        $document_file_name = "index.html" unless defined $document_file_name;
        $local_document_file = &FileUtils::filenameConcatenate($documents_directory, $document_file_name);

        # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
        &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

        # List-form system bypasses the shell, so quotes, "$" or backticks in
        # the URL (which comes from record metadata) cannot alter the command.
        my @wget_command = ("wget",
                            (($verbosity > 2) ? "--verbose" : "--quiet"),
                            "--timestamping", # Only re-download files if they're newer
                            $document_file_full,
                            "--output-document", $local_document_file);
        my $wget_status = system(@wget_command);

        # wget creates the output file before any data arrives, so a failed
        # download can leave an empty file that must not count as success.
        if ($wget_status != 0 && -z $local_document_file) {
            unlink($local_document_file);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &FileUtils::filenameConcatenate($document_file_full);
        # NOTE(review): used unescaped inside the pattern below, so
        # get_os_dirsep presumably returns a regex-ready separator - confirm.
        my $dir_sep = &util::get_os_dirsep();

        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n" if ($verbosity > 1);

        if (!$is_absolute) {
            $document_file_full
                = &FileUtils::filenameConcatenate($orig_base_dir,$document_file_full);
        }

        $local_document_file = &FileUtils::filenameConcatenate($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &FileUtils::copyFiles($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &FileUtils::removeFiles($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        open(NULL_FILE, ">", "$local_document_file.nul")
            or die "Error: could not create $local_document_file.nul: $!\n";
        close(NULL_FILE);
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
643
# Looks up $plugin in the collection configuration and returns its options
# as text suitable for placing inside a Perl list: each option wrapped in
# double quotes, joined with commas (e.g. "-opt","value").
#
#   $collectcfg - hash ref whose 'plugin' entry is an array ref of
#                 [plugin_name, option, option, ...] array refs
#   $plugin     - plugin name to look for
#
# Returns the options of the first matching entry, or "" when the plugin is
# not listed (or there is no plugin list).  The configuration is left
# unmodified.  Backslash, double quote, "$" and "@" inside an option are
# backslash-escaped so that the quoted text evaluates back to the exact
# option value (regular-expression options would otherwise be mangled).
sub get_plugin_options {
    my ($collectcfg, $plugin) = @_;

    my $plugin_list = $collectcfg->{'plugin'};
    return "" unless defined $plugin_list;

    foreach my $pluginoptions (@$plugin_list) {
        # Work on a copy: the entry belongs to the caller's configuration.
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless defined $pluginname && $pluginname eq $plugin;

        my @quoted_options = map {
            my $option = $_;
            $option =~ s/([\\"\$\@])/\\$1/g;
            "\"$option\"";
        } @option_values;
        return join(",", @quoted_options);
    }
    return "";
}
658
# Run the script with the command-line arguments.
main(@ARGV);
660
Note: See TracBrowser for help on using the repository browser.