source: main/trunk/greenstone2/bin/script/explode_metadata_database.pl@ 27901

Last change on this file since 27901 was 27901, checked in by ak19, 11 years ago

Bugfix to MARCPlugin so that the MARC tutorial now works: exploding didn't handle special characters properly before. Followed Dr Bainbridge's suggestion of tracking down where exploding the MARC file writes out the metadata.xml files, and added that the output file handle should be in utf8 binmode.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.5 KB
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
8}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use extrametautil;
16use printusage;
17use parse2;
18use colcfg;
19
20use FileHandle;
21
22use File::Spec;
23use File::Basename;
24
# Choices offered for the -input_encoding option: four fixed entries first,
# followed by one entry per encoding listed in $encodings::encodings.
my $unicode_list = [
    { 'name' => "auto",    'desc' => "{ReadTextFile.input_encoding.auto}" },
    { 'name' => "ascii",   'desc' => "{BasePlugin.encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasePlugin.encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasePlugin.encoding.unicode}" },
];

# Append the remaining encodings, ordered by their 'name' value; each entry
# uses the encoding key as its name and the encoding's 'name' as description.
my $e = $encodings::encodings;
push @{$unicode_list},
    map  { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } }
    sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} }
    keys %$e;
44
# Command-line option descriptions handed to parse2::parse and PrintUsage.
# The 'desc' values in braces are resource-bundle keys, not literal text.
my $arguments = [
    {
        'name'      => "language",
        'desc'      => "{scripts.language}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "plugin",
        'desc'      => "{explode.plugin}",
        'type'      => "string",
        'reqd'      => "yes",
        'hiddengli' => "yes",
    },
    {
        'name' => "input_encoding",
        'desc' => "{explode.encoding}",
        'type' => "enum",
        'deft' => "auto",
        'list' => $unicode_list,
        'reqd' => "no",
    },
    {
        'name' => "metadata_set",
        'desc' => "{explode.metadata_set}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "document_field",
        'desc' => "{explode.document_field}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "document_prefix",
        'desc' => "{explode.document_prefix}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "document_suffix",
        'desc' => "{explode.document_suffix}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name'  => "records_per_folder",
        'desc'  => "{explode.records_per_folder}",
        'type'  => "int",
        'range' => "0,",
        'deft'  => "100",
        'reqd'  => "no",
    },
    {
        'name'      => "collectdir",
        'desc'      => "{import.collectdir}",
        'type'      => "string",
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        'deft'      => "",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "site",
        'desc'      => "{import.site}",
        'type'      => "string",
        'deft'      => "",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "collection",
        'desc'      => "{explode.collection}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "use_collection_plugin_options",
        'desc'      => "{explode.use_collection_plugin_options}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "plugin_options",
        'desc'      => "{explode.plugin_options}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'    => "verbosity",
        'desc'    => "{import.verbosity}",
        'type'    => "int",
        'range'   => "0,",
        'deft'    => "1",
        'reqd'    => "no",
        'modegli' => "3",
    },
    {
        'name'      => "xml",
        'desc'      => "",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
];
127
# Script-level usage description (name, description key and option table)
# passed to the PrintUsage routines.
my $options = {
    'name' => "explode_metadata_database.pl",
    'desc' => "{explode.desc}",
    'args' => $arguments,
};
131
132
133
# Entry point of the script.
#
# Parses the command line (taken from @ARGV; the arguments passed in @_ are
# not used), loads and instantiates the requested plugin, and "explodes" the
# metadata database file named on the command line: every record gets a
# document (or a dummy .nul file) in a newly created directory, described by
# an entry in that directory's metadata.xml.  Afterwards the original database
# file is removed and the plugin is asked to clean up.
#
# Dies, after printing usage and/or an error message to STDERR, when the
# arguments are invalid, the input file or collection does not exist, the
# plugin cannot be constructed, or no output directory name can be derived
# from the input filename.
sub main
{
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy every parsed option into the local variable of the same name.
    # An explicit lookup table is used instead of building Perl source from
    # the key names and eval-ing it; keys without a matching variable are
    # ignored, and variables without a parsed value keep their initial value.
    my %variable_for = (
        'language'                      => \$language,
        'input_encoding'                => \$input_encoding,
        'metadata_set'                  => \$metadata_set,
        'plugin'                        => \$plugin,
        'document_field'                => \$document_field,
        'document_prefix'               => \$document_prefix,
        'document_suffix'               => \$document_suffix,
        'records_per_folder'            => \$records_per_folder,
        'plugin_options'                => \$plugin_options,
        'collectdir'                    => \$collectdir,
        'site'                          => \$site,
        'collection'                    => \$collection,
        'use_collection_plugin_options' => \$use_collection_plugin_options,
        'verbosity'                     => \$verbosity,
        'xml'                           => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $variable_ref = $variable_for{$strVariable};
        next unless defined $variable_ref;
        $$variable_ref = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: when given it becomes a "<set>." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }

    if (defined $collection && $collection =~ /\w/) {
        # use_collection returns "" on failure, so remember the name the user
        # asked for in order to be able to report it.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    } else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }

    my $plugobj;
    require "$plugin.pm";

    # Work out the plugin options as a comma-separated list of quoted strings
    # (Perl source text that is spliced into the constructor call below).
    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file.
        my $gs_mode = "gs2";
        if ((defined $site) && ($site ne "")) { # GS3
            $gs_mode = "gs3";
        }
        # "STDERR" is passed as a string, which is what the bareword STDERR
        # evaluates to under "no strict 'subs'".
        my $configfilename = &colcfg::get_collect_cfg_name("STDERR", $gs_mode);
        my $collectcfg = &colcfg::read_collect_cfg($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        my @options = split(/\s/, $plugin_options);
        foreach my $option (@options) {
            # quote each option unless it already starts with a quote
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options_string = join(",", @options);
    }

    # Construct the plugin; the class name is only known at run time.
    if ($plugin_options_string eq "") {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    } else {
        eval ("\$plugobj = new $plugin([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding($filename);
    }

    # The directory to store the document files in is the input filename
    # minus its extension.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $exploded_base_dir) {
        # Without an extension there is no directory name to use; stop here
        # rather than carry on and delete the input file at the end.
        die "Error: cannot work out an output directory for $filename (the filename has no extension).\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp
        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);

            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar(@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";
        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            &check_need_new_directory($exploded_base_dir, $record_number, $records_per_folder,
                                      $total_num_records, \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata($doc_obj, $doc_obj->get_top_section(),
                                     &extrametautil::getmetadata($extrametadata, $record));

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            &check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();

}
355
356
# Creates the directory $exploded_base_dir and starts a metadata.xml file in
# it: the file is opened on the package filehandle METADATA_XML_FILE (which
# the other routines in this script write to and close) in utf8 mode, and the
# XML declaration, DOCTYPE and opening <DirectoryMetadata> tag are written.
#
# Returns the directory path.
# Dies if the directory or its metadata.xml already exists, or if the
# metadata.xml file cannot be opened for writing.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file.  A failure must be fatal: carrying on would
    # silently discard every metadata entry written to the handle.
    open(METADATA_XML_FILE, '>', $documents_metadata_xml_file)
        or die "Error: could not open $documents_metadata_xml_file for writing: $!\n";
    binmode METADATA_XML_FILE, ":utf8";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
383
# Starts a new output directory when $record_number is the first record of a
# folder, storing the new directory path through $documents_dir_ref.
#
#   $exploded_base_dir   base path for the output directory
#   $record_number       1-based number of the record about to be written
#   $records_per_folder  maximum number of records per directory
#   $total_num_records   total number of records being exploded
#   $documents_dir_ref   reference to the scalar holding the current directory
#
# When all the records do not fit in one folder, the directory name gets the
# zero-padded number of its first record appended (".00000001", ...).
#
# A $records_per_folder of 0 (which the -records_per_folder argument range
# accepts) is treated as "no per-folder limit": one directory, without a
# numeric suffix, is started at record 1.  Previously this value made the
# modulus below die with "Illegal modulus zero".
sub check_need_new_directory
{
    my ($exploded_base_dir, $record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    my $no_folder_limit = ($records_per_folder <= 0);

    my $starts_new_folder;
    if ($no_folder_limit) {
        $starts_new_folder = ($record_number == 1);
    }
    else {
        # The explicit "== 1" test is needed because x % 1 is always 0
        $starts_new_folder = ($records_per_folder == 1
                              || ($record_number % $records_per_folder) == 1);
    }

    if ($starts_new_folder) {
        my $documents_directory = $exploded_base_dir;

        if (!$no_folder_limit && $total_num_records > $records_per_folder) {
            $documents_directory .= "." . sprintf("%8.8d", $record_number);
        }

        $$documents_dir_ref = need_new_directory($documents_directory);
    }
}
401
402
403
404
405
# Writes the metadata.xml entry (or entries) for one record.
#
# If $document_field is defined, every metadata pair of the record whose field
# name equals it is taken to name a document: the document is obtained with
# obtain_document (using $document_prefix and $document_suffix to build its
# location) and an entry for it is written to METADATA_XML_FILE.  If no
# document was obtained that way, an empty "<record number>.nul" file (or
# "doc.nul" when $record_number is undefined) is created in
# $documents_directory and the entry is written for that file instead.
#
#   $doc_obj              document object holding the record's metadata
#   $documents_directory  directory the documents / nul files go into
#   $orig_base_dir        directory of the original database file
#   $metadata_set         metadata set prefix passed on to the entry writer
#   $verbosity            passed on to obtain_document
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # The prefix and suffix may be undefined; treat them as empty strings so
    # the -d test and the concatenations below work on defined values.
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g; # don't regex brackets () here though!

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            my $document_file_full;
            if (-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@) {
                # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                $document_prefix =~ s/(\/|\\)$//;
                $document_file_full = &util::filename_cat($document_prefix, "$value$document_suffix");
            }
            else {
                # the doc prefix may also contain the prefix of the actual *filename* following the directory
                $document_file_full = $document_prefix . $value . $document_suffix;
            }

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            # The handle is passed as a glob reference so the call does not
            # depend on barewords being allowed as arguments.
            &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        my $dummy_path = "$documents_directory/$document_file";
        if (open(my $dummy_fh, '>', $dummy_path)) {
            close($dummy_fh);
        }
        else {
            # Report the failure but still write the entry, as before
            print STDERR "WARNING: Could not create nul document $dummy_path: $!\n";
        }
        &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
456
# Finishes the metadata.xml file currently open on METADATA_XML_FILE: writes
# the closing </DirectoryMetadata> tag and closes the handle.
#
# A failure to write or close is reported on STDERR (write errors on a
# buffered handle may only show up when it is closed) but is not fatal.
# Returns the result of close().
sub close_directory
{
    my $tag_written = print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
    if (!$tag_written) {
        print STDERR "WARNING: Could not write the end of the metadata.xml file: $!\n";
    }

    my $closed = close(METADATA_XML_FILE);
    if (!$closed) {
        print STDERR "WARNING: Could not close the metadata.xml file: $!\n";
    }

    return $closed;
}
464
465
# Finishes and closes the current metadata.xml file (via close_directory) when
# $record_number is the last record of its folder or the last record overall.
#
#   $record_number       1-based number of the record just written
#   $records_per_folder  maximum number of records per directory
#   $total_num_records   total number of records being exploded
#
# A $records_per_folder of 0 (which the -records_per_folder argument range
# accepts) is treated as "no per-folder limit", so the file is only closed
# after the final record.  Previously this value made the modulus die with
# "Illegal modulus zero".
sub check_close_directory
{
    my ($record_number, $records_per_folder, $total_num_records) = @_;

    my $folder_is_full = ($records_per_folder > 0)
        && (($record_number % $records_per_folder) == 0);

    if ($folder_is_full || $record_number == $total_num_records) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
475
476
477
# Writes one <FileSet> entry to a metadata.xml file.
#
#   $metadata_xml_file  filehandle to print to
#   $file_name          name of the file the entry applies to; it is written
#                       XML-escaped and as a regular expression matching it
#   $record_metadata    reference to a list of [field, value] pairs
#   $meta_prefix        metadata set prefix ("" for none); when non-empty it
#                       replaces the first "word." prefix of each field name
#
# The fields lastmodified, gsdlsourcefilename, gsdldoctype and FileFormat, and
# any field ending in "^all", are left out.  A leading "ex." is removed from a
# field name when it is the only prefix.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    # Make $file_name XML-safe
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it.  Besides
    # . ( ) { } [ ] the quantifier, alternation and anchor characters
    # + * ? | ^ $ are escaped too, so that a name such as "a+b.pdf" yields
    # an expression that matches that name.
    $file_name =~ s/([.(){}\[\]+*?|^\$])/\\$1/g;

    print {$metadata_xml_file}
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    # We're only interested in metadata from the database
    my %is_skipped_field = map { $_ => 1 }
        ("lastmodified", "gsdlsourcefilename", "gsdldoctype", "FileFormat");

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        next if $is_skipped_field{$field};

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
        $value =~ s/\[/&\#091;/g;
        $value =~ s/\]/&\#093;/g;

        # Make $value XML-safe.  This runs after the bracket replacement, so
        # the & of the bracket entities is itself escaped (&amp;#091;).
        $value =~ s/&/&amp;/g; # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # we are not allowed & in xml except in entities.
        # if there are undefined entities then parsing will also crap out.
        # should we be checking for them too?
        # this may not get all possibilities
        # $value =~ s/&([^;\s]*(\s|$))/&amp;$1/g;

        # do we already have a namespace specified?
        my $full_field = $field;
        if ($meta_prefix ne "") {
            $full_field =~ s/^\w+\.//;
            $full_field = $meta_prefix . $full_field;
        }

        # The name goes into a double-quoted attribute, so make it XML-safe
        $full_field =~ s/&/&amp;/g;
        $full_field =~ s/</&lt;/g;
        $full_field =~ s/>/&gt;/g;
        $full_field =~ s/"/&quot;/g;

        print {$metadata_xml_file} " <Metadata mode=\"accumulate\" name=\"$full_field\">$value</Metadata>\n";
    }

    print {$metadata_xml_file}
        " </Description>\n" .
        " </FileSet>\n";
}
548
# Obtains the document $document_file_full and places it in
# $documents_directory.
#
# A location starting with http:, https: or ftp: is downloaded with wget.
# Anything else is treated as a file on disk: a relative path is resolved
# against $orig_base_dir, the file is copied into $documents_directory, and
# the original is removed when it lies under $orig_base_dir.
#
# If the document is not present in $documents_directory afterwards, an empty
# "<name>.nul" file is created there instead.
#
# Returns the name (without directory) of the document, or of the .nul file.
sub obtain_document
{
    my ($document_file_full, $documents_directory, $orig_base_dir, $verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        $document_file_full =~ /([^\/]+)$/;
        $document_file_name = $1;
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_options = (($verbosity > 2) ? "--verbose" : "--quiet");
        push(@wget_options, "--timestamping"); # Only re-download files if they're newer

        # The URL comes from the metadata records, so wget is run through the
        # list form of system: no shell is involved and quotes, backticks or
        # $(...) in the URL are passed to wget as plain text.
        system("wget", @wget_options, $document_file_full,
               "--output-document", $local_document_file);

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &util::filename_cat($document_file_full);
        my $dir_sep = &util::get_os_dirsep();

        # everything after the last directory separator is the file name
        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir, $document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &util::cp($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        if (open(my $nul_fh, '>', "$local_document_file.nul")) {
            close($nul_fh);
        }
        else {
            print STDERR "WARNING: Could not create nul document $local_document_file.nul: $!\n";
        }
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
623
# Looks up the options configured for $plugin in a collection configuration.
#
#   $collectcfg  hash reference whose 'plugin' entry is a reference to a list
#                of array references, each holding a plugin name followed by
#                its options
#   $plugin      name of the plugin to look up
#
# Returns the options of the first entry for $plugin as a comma-separated list
# with each option wrapped in double quotes, or "" when the plugin is not
# listed or has no options.  The configuration itself is left unmodified
# (entries are copied rather than shifted and re-quoted in place).
sub get_plugin_options {
    my ($collectcfg, $plugin) = @_;

    my $plugin_list = $collectcfg->{'plugin'};

    foreach my $pluginoptions (@$plugin_list) {
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless $pluginname eq $plugin;
        return join(",", map { "\"$_\"" } @option_values);
    }
    return "";
}
638
# Run the script, handing main the command-line arguments.
main(@ARGV);
640
Note: See TracBrowser for help on using the repository browser.