source: main/trunk/greenstone2/bin/script/explode_metadata_database.pl@ 26567

Last change on this file since 26567 was 26567, checked in by ak19, 11 years ago

When a GS2 collection contains both collect.cfg and collectionConfig.xml (as advanced beatles does) the old code used to end up reading in the GS3 collectionConfig.xml instead of the GS2 collect.cfg and set the GS_mode to GS3. Now colcfg::get_collect_cfg_name takes the gs_mode (instead of determining this and returning it) and works out the collectcfg file name for the gs_mode. That means that the calling functions now need to work out the gs_mode. They do so by setting the gs_mode to gs3 if the site flag is present in the commandline, if not then it defaults to gs2. So from now on, the site flag must be specified for GS3 collections.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.5 KB
Line 
1#!/usr/bin/perl -w
2
3
BEGIN {
    # The Greenstone Perl modules are located relative to GSDLHOME, so the
    # script cannot load anything without it.
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }

    # The plugins directory ends up first in @INC, followed by perllib itself.
    unshift(@INC, "$gsdl_home/perllib/plugins", "$gsdl_home/perllib");
}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use extrametautil;
16use printusage;
17use parse2;
18use colcfg;
19
20use FileHandle;
21
22use File::Spec;
23use File::Basename;
24
# Values accepted by the -input_encoding option: four fixed entries (whose
# descriptions are resource-bundle keys), followed by one entry for every
# encoding listed in $encodings::encodings, ordered by the encoding's
# human-readable name.
my $unicode_list = [
    { name => 'auto',    desc => '{ReadTextFile.input_encoding.auto}' },
    { name => 'ascii',   desc => '{BasePlugin.encoding.ascii}' },
    { name => 'utf8',    desc => '{BasePlugin.encoding.utf8}' },
    { name => 'unicode', desc => '{BasePlugin.encoding.unicode}' },
];

my $e = $encodings::encodings;
my @encoding_ids = sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} } keys(%$e);
push(@$unicode_list,
     map { +{ name => $_, desc => $e->{$_}->{'name'} } } @encoding_ids);
44
# Command-line options accepted by this script, in the table format that is
# handed to parse2::parse and to the PrintUsage routines. Each 'desc' is a
# resource-bundle key rather than literal text.
my $arguments = [
    {   name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'plugin',
        desc      => '{explode.plugin}',
        type      => 'string',
        reqd      => 'yes',
        hiddengli => 'yes',
    },
    {   name => 'input_encoding',
        desc => '{explode.encoding}',
        type => 'enum',
        deft => 'auto',
        list => $unicode_list,
        reqd => 'no',
    },
    {   name => 'metadata_set',
        desc => '{explode.metadata_set}',
        type => 'string',
        reqd => 'no',
    },
    {   name => 'document_field',
        desc => '{explode.document_field}',
        type => 'string',
        reqd => 'no',
    },
    {   name => 'document_prefix',
        desc => '{explode.document_prefix}',
        type => 'string',
        reqd => 'no',
    },
    {   name => 'document_suffix',
        desc => '{explode.document_suffix}',
        type => 'string',
        reqd => 'no',
    },
    {   name  => 'records_per_folder',
        desc  => '{explode.records_per_folder}',
        type  => 'int',
        range => '0,',
        deft  => '100',
        reqd  => 'no',
    },
    {   name      => 'collectdir',
        desc      => '{import.collectdir}',
        type      => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'site',
        desc      => '{import.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'collection',
        desc      => '{explode.collection}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'use_collection_plugin_options',
        desc      => '{explode.use_collection_plugin_options}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'plugin_options',
        desc      => '{explode.plugin_options}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'verbosity',
        desc    => '{import.verbosity}',
        type    => 'int',
        range   => '0,',
        deft    => '1',
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'xml',
        desc      => '',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];

# Script-level description: the script name, its description key and the
# option table above. Passed to the PrintUsage routines.
my $options = {
    name => 'explode_metadata_database.pl',
    desc => '{explode.desc}',
    args => $arguments,
};
131
132
133
# Entry point: explodes one metadata database file into per-record documents.
#
# Parses the command line (options are described by $arguments), loads the
# requested plugin, uses it to obtain the records in the file, and for every
# record writes an entry into a metadata.xml file inside a newly created
# directory named after the database file (minus its extension). Records are
# grouped into folders of -records_per_folder records. When everything has
# been written the original database file is deleted.
#
# Dies (after printing a message or the usage text) when the arguments are
# invalid, the file does not exist, or the collection cannot be found.
sub main
{
    # The names of these variables must match the option names in $arguments
    # exactly: parsed values are assigned to them by name via the string eval
    # further down.
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy every parsed option into the lexical variable of the same name
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-empty set name becomes the "set." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }

    if (defined $collection && $collection =~ /\w/) {
        # Remember what the user asked for: use_collection returns "" on
        # failure, and the error message needs the requested name.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    } else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }
    my $plugobj;
    require "$plugin.pm";

    # Build the plugin's constructor options as a comma-separated list of
    # double-quoted strings; it is spliced into the constructor eval below.
    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file.
        # The site option decides between GS3 and GS2 configuration files.
        my $gs_mode = "gs2";
        if ((defined $site) && ($site ne "")) { # GS3
            $gs_mode = "gs3";
        }
        my $configfilename = &colcfg::get_collect_cfg_name(STDERR, $gs_mode);
        my $collectcfg = &colcfg::read_collect_cfg ($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        my @options = split(/\s/, $plugin_options);
        # quote every option that is not already quoted
        foreach my $option (@options) {
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options_string = join(",", @options);
    }

    if ($plugin_options_string eq "") {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    } else {
        eval ("\$plugobj = new $plugin([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # Create a directory to store the document files...
    # It is named after the database file with its extension removed.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq "") {
        # Without an extension to strip there is no usable directory name.
        # Stop here, before anything is written and before the original file
        # is deleted.
        print STDERR "Error: cannot derive a directory name from $filename (the file name needs a name and an extension)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp
        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir,$record_number,
                                     $records_per_folder,$total_num_records,
                                     \$documents_directory);
            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            check_close_directory($record_number,$records_per_folder,$total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar (@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";
        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            &check_need_new_directory($exploded_base_dir, $record_number, $records_per_folder, $total_num_records, \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), &extrametautil::getmetadata($extrametadata, $record));

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number, $documents_directory, $orig_base_dir, $document_prefix, $document_suffix, $metadata_set, $verbosity);

            &check_close_directory($record_number,$records_per_folder,$total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();

}
355
356
# Creates a fresh documents directory and starts its metadata.xml file.
#
# Parameter: the path of the directory to create.
# Returns:   that same path.
# Dies when the directory or its metadata.xml already exists, or when the
# metadata.xml file cannot be opened for writing.
#
# Side effect: leaves the package-level filehandle METADATA_XML_FILE open on
# the new metadata.xml, with the XML header and the opening
# <DirectoryMetadata> tag already written. Other subs in this file write to,
# and eventually close, that handle.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file.
    # A failed open must be fatal: otherwise every later write to the handle
    # is silently discarded.
    open(METADATA_XML_FILE, '>', $documents_metadata_xml_file)
        or die "Error: could not create documents metadata.xml file $documents_metadata_xml_file: $!\n";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
382
# Starts a new documents directory when the given record is the first record
# of a folder.
#
# Parameters:
#   $exploded_base_dir   base path for the documents directories
#   $record_number       1-based number of the record about to be written
#   $records_per_folder  maximum number of records per folder; 0 (or less, or
#                        undefined) is treated as "no limit", i.e. a single
#                        folder holds every record
#   $total_num_records   total number of records being exploded
#   $documents_dir_ref   reference to a scalar that receives the path of the
#                        newly created directory
#
# When the records do not all fit into one folder, each directory name gets
# the zero-padded number of its first record appended as a suffix.
sub check_need_new_directory
{
    my ($exploded_base_dir,$record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    # A limit of 0 is permitted by the option's range but cannot be used as a
    # modulus ("Illegal modulus zero"), so handle it separately.
    my $unlimited = (!defined $records_per_folder || $records_per_folder <= 0);

    # Check if we need to start a new directory for these records
    my $starts_new_folder;
    if ($unlimited) {
        $starts_new_folder = ($record_number == 1);
    }
    else {
        $starts_new_folder = ($records_per_folder == 1
                              || ($record_number % $records_per_folder) == 1);
    }
    return unless $starts_new_folder;

    my $documents_directory = $exploded_base_dir;
    if (!$unlimited && $total_num_records > $records_per_folder) {
        $documents_directory .= "." . sprintf("%8.8d", $record_number);
    }

    $$documents_dir_ref = need_new_directory($documents_directory);
    return;
}
400
401
402
403
404
# Writes the metadata.xml entry (or entries) for one record.
#
# If $document_field is defined, every metadata pair of the record whose field
# name equals it is taken as naming a document: the value is combined with
# $document_prefix and $document_suffix, the document is obtained through
# obtain_document, and an entry for the resulting file is written to the open
# METADATA_XML_FILE handle. If no document was obtained this way, an empty
# "<record number>.nul" placeholder file (or "doc.nul" when no record number
# is given) is created in $documents_directory and the entry is written for
# that file instead.
#
# Parameters:
#   $document_field       name of the metadata field naming the document (may be undef)
#   $doc_obj              document object holding the record's metadata
#   $record_number        number of the record (may be undef)
#   $documents_directory  directory the documents and placeholders go into
#   $orig_base_dir        directory of the original database file
#   $document_prefix      text put before the field value (may be undef)
#   $document_suffix      text put after the field value (may be undef)
#   $metadata_set         metadata set prefix, passed through to the entry writer
#   $verbosity            verbosity level, passed through to obtain_document
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both are optional on the command line. Treat a missing one as empty so
    # the -d test and the concatenations below do not operate on undef.
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g; # don't regex brackets () here though!

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            my $document_file_full;
            if (-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@ ) {
                # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                $document_prefix =~ s/(\/|\\)$//;
                $document_file_full = &util::filename_cat($document_prefix, "$value$document_suffix");
            }
            else { # the doc prefix may also contain the prefix of the actual *filename* following the directory
                $document_file_full = $document_prefix . $value . $document_suffix;
            }

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        my $dummy_file_path = "$documents_directory/$document_file";
        if (open(my $dummy_fh, '>', $dummy_file_path)) {
            close($dummy_fh);
        }
        else {
            # The metadata entry is still written below; only the placeholder is missing.
            print STDERR "WARNING: Could not create nul document $dummy_file_path: $!\n";
        }
        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
455
# Finishes the current metadata.xml file: writes the closing
# </DirectoryMetadata> tag to the package-level METADATA_XML_FILE handle and
# closes it.
#
# Dies if the tag cannot be written or the handle cannot be closed. Buffered
# write errors only surface at close time, so ignoring the result could leave
# a truncated metadata.xml unnoticed.
sub close_directory
{
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"
        or die "Error: could not finish writing the metadata.xml file: $!\n";
    close(METADATA_XML_FILE)
        or die "Error: could not close the metadata.xml file: $!\n";
}
463
464
# Closes the current documents directory (via close_directory) when the given
# record is the last one of its folder or the last record overall.
#
# Parameters:
#   $record_number       1-based number of the record just written
#   $records_per_folder  maximum number of records per folder; 0 (or less, or
#                        undefined) is treated as "no limit", so only the very
#                        last record closes the folder
#   $total_num_records   total number of records being exploded
sub check_close_directory
{
    my ($record_number,$records_per_folder,$total_num_records) = @_;

    # A limit of 0 is permitted by the option's range but cannot be used as a
    # modulus ("Illegal modulus zero"), so only test for a full folder when
    # there is a positive limit.
    my $folder_is_full = 0;
    if (defined $records_per_folder && $records_per_folder > 0) {
        $folder_is_full = (($record_number % $records_per_folder) == 0);
    }

    if ($folder_is_full || $record_number == $total_num_records) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
474
475
476
# Writes one <FileSet> entry to a metadata.xml file.
#
# Parameters (positional):
#   $metadata_xml_file  filehandle (or handle name) to print the entry to
#   $file_name          name of the file the metadata belongs to
#   $record_metadata    array reference of [field, value] pairs
#   $meta_prefix        metadata set prefix including its trailing "." (or "");
#                       when non-empty it replaces a leading "word." namespace
#                       of each field name, or is prepended if there is none
#
# The file name is made XML-safe and then converted into a regular expression
# that matches it literally. The fields lastmodified, gsdlsourcefilename,
# gsdldoctype, FileFormat and any field ending in "^all" are left out.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    # Make $file_name XML-safe
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it.
    # Every regex metacharacter is escaped, including + * ? | ^ $ and the
    # backslash itself; otherwise a name such as "a+b.pdf" would become a
    # pattern that does not match the file it was derived from.
    $file_name =~ s/([\\.(){}\[\]+*?|^\$])/\\$1/g;

    print $metadata_xml_file
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        # We're only interested in metadata from the database
        next if ($field eq "lastmodified");
        next if ($field eq "gsdlsourcefilename");
        next if ($field eq "gsdldoctype");
        next if ($field eq "FileFormat");

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
        # NOTE(review): the "&" introduced here is itself escaped by the next
        # step, so the file ends up containing "&amp;#091;" - presumably
        # intended, confirm against the metadata.xml reader.
        $value =~ s/\[/&\#091;/g;
        $value =~ s/\]/&\#093;/g;

        # Make $value XML-safe
        $value =~ s/&/&amp;/g; # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # we are not allowed & in xml except in entities.
        # if there are undefined entities then parsing will also crap out.
        # should we be checking for them too?
        # this may not get all possibilities
        # $value =~ s/&([^;\s]*(\s|$))/&amp;$1/g;

        # do we already have a namespace specified?
        my $full_field = $field;
        if ($meta_prefix ne "") {
            $full_field =~ s/^\w+\.//;
            $full_field = $meta_prefix.$full_field;
        }

        print $metadata_xml_file " <Metadata mode=\"accumulate\" name=\"$full_field\">$value</Metadata>\n";
    }

    print $metadata_xml_file
        " </Description>\n" .
        " </FileSet>\n";
}
547
# Obtains the document named by a metadata record and puts it into the
# documents directory.
#
# Parameters:
#   $document_file_full   URL (http, https or ftp) or file path of the document
#   $documents_directory  directory the document is to end up in
#   $orig_base_dir        directory of the original database file; relative
#                         paths are resolved against it
#   $verbosity            verbosity level (messages from 2, verbose wget from 3)
#
# Returns the name of the file inside $documents_directory that represents the
# document: the document's own file name, or that name with ".nul" appended
# when the document could not be obtained and an empty placeholder was created.
#
# A local document that lives under $orig_base_dir is removed from its
# original location once it has been copied.
sub obtain_document
{
    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        # the file name is whatever follows the last slash
        ($document_file_name) = ($document_file_full =~ /([^\/]+)$/);
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_command = ("wget");
        push(@wget_command, ($verbosity > 2) ? "--verbose" : "--quiet");
        push(@wget_command, "--timestamping"); # Only re-download files if they're newer
        push(@wget_command, $document_file_full, "--output-document", $local_document_file);

        # The URL comes from the metadata being exploded, so wget is run with
        # an argument list rather than through the shell: nothing inside the
        # URL or the file name can be interpreted as shell syntax.
        system(@wget_command);

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &util::filename_cat($document_file_full);
        my $dir_sep = &util::get_os_dirsep();

        # NOTE(review): $dir_sep is interpolated into the pattern as-is, so this
        # relies on get_os_dirsep returning a regex-safe separator - confirm.
        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir,$document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &util::cp($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        if (open(my $nul_fh, '>', "$local_document_file.nul")) {
            close($nul_fh);
        }
        else {
            # Best effort, like the download/copy itself: report and carry on.
            print STDERR "WARNING: Could not create nul document $local_document_file.nul: $!\n";
        }
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
622
# Looks up the options configured for a plugin in a collection configuration.
#
# Parameters:
#   $collectcfg  hash reference whose 'plugin' entry is an array reference of
#                array references, each of the form [plugin name, option, ...]
#   $plugin      name of the plugin to look up
#
# Returns the options of the first entry whose name equals $plugin, each
# wrapped in double quotes and joined with commas, or "" when the plugin is
# not listed (or has no options).
#
# The configuration structure is only read, never modified, so the function
# gives the same answer however often it is called.
sub get_plugin_options {
    my ($collectcfg, $plugin) = @_;

    my $plugin_list = $collectcfg->{'plugin'};
    return "" unless defined $plugin_list;

    foreach my $pluginoptions (@$plugin_list) {
        # work on a copy: the first element is the name, the rest are options
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless defined $pluginname && $pluginname eq $plugin;
        return join(",", map { "\"$_\"" } @option_values);
    }
    return "";
}
637
# Run the script, handing over the command-line arguments.
main(@ARGV);
639
Note: See TracBrowser for help on using the repository browser.