source: main/trunk/greenstone2/bin/script/explode_metadata_database.pl@ 31888

Last change on this file since 31888 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.0 KB
Line 
1#!/usr/bin/perl -w
2
3
# Make the Greenstone Perl libraries loadable before any 'use' line below is
# compiled. The script cannot work without GSDLHOME, so its absence is fatal.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    # The plugins directory ends up first in @INC, followed by perllib itself
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, "$perllib_dir/plugins", $perllib_dir);
}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use extrametautil;
16use util;
17use FileUtils;
18use printusage;
19use parse2;
20use colcfg;
21
22use FileHandle;
23
24use File::Spec;
25use File::Basename;
26
# Values accepted by -input_encoding: four fixed entries, followed by one
# entry for every encoding listed in $encodings::encodings, ordered by the
# encoding's display name.
my $encoding_table = $encodings::encodings;

my @fixed_encodings = (
    { 'name' => "auto",    'desc' => "{ReadTextFile.input_encoding.auto}" },
    { 'name' => "ascii",   'desc' => "{BasePlugin.encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasePlugin.encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasePlugin.encoding.unicode}" },
);

my @sorted_encoding_keys =
    sort { $encoding_table->{$a}->{'name'} cmp $encoding_table->{$b}->{'name'} }
    keys(%$encoding_table);

my $unicode_list = [
    @fixed_encodings,
    map { +{ 'name' => $_, 'desc' => $encoding_table->{$_}->{'name'} } } @sorted_encoding_keys,
];
46
# Command line options understood by this script, in the order in which they
# are listed in the usage output.
my @argument_specs = (
    # --- general ---
    { 'name'      => "language",
      'desc'      => "{scripts.language}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "plugin",
      'desc'      => "{explode.plugin}",
      'type'      => "string",
      'reqd'      => "yes",
      'hiddengli' => "yes" },
    { 'name'      => "input_encoding",
      'desc'      => "{explode.encoding}",
      'type'      => "enum",
      'deft'      => "auto",
      'list'      => $unicode_list,
      'reqd'      => "no" },

    # --- how records map onto metadata and documents ---
    { 'name'      => "metadata_set",
      'desc'      => "{explode.metadata_set}",
      'type'      => "string",
      'reqd'      => "no" },
    { 'name'      => "document_field",
      'desc'      => "{explode.document_field}",
      'type'      => "string",
      'reqd'      => "no" },
    { 'name'      => "document_prefix",
      'desc'      => "{explode.document_prefix}",
      'type'      => "string",
      'reqd'      => "no" },
    { 'name'      => "document_suffix",
      'desc'      => "{explode.document_suffix}",
      'type'      => "string",
      'reqd'      => "no" },
    { 'name'      => "records_per_folder",
      'desc'      => "{explode.records_per_folder}",
      'type'      => "int",
      'range'     => "0,",
      'deft'      => "100",
      'reqd'      => "no" },

    # --- collection context ---
    { 'name'      => "collectdir",
      'desc'      => "{import.collectdir}",
      'type'      => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "site",
      'desc'      => "{import.site}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "collection",
      'desc'      => "{explode.collection}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    # --- plugin configuration ---
    { 'name'      => "use_collection_plugin_options",
      'desc'      => "{explode.use_collection_plugin_options}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "plugin_options",
      'desc'      => "{explode.plugin_options}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    # --- output control ---
    { 'name'      => "verbosity",
      'desc'      => "{import.verbosity}",
      'type'      => "int",
      'range'     => "0,",
      'deft'      => "1",
      'reqd'      => "no",
      'modegli'   => "3" },
    { 'name'      => "xml",
      'desc'      => "",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
);

my $arguments = \@argument_specs;

# Top-level description of the script, as handed to the usage printers
my $options = {
    'name' => "explode_metadata_database.pl",
    'desc' => "{explode.desc}",
    'args' => $arguments,
};
133
134
135
# Entry point: explode a metadata database file into per-record metadata.
#
# Command line: the options described by $arguments, followed by exactly one
# remaining argument, the metadata database file.
#
# The named plugin is loaded and used to obtain the records, either by
# splitting the file text with the plugin's 'split_exp' or, when the plugin
# has none, through its metadata_read method. For every record a document is
# obtained (or a dummy .nul file is created) and an entry is written to a
# metadata.xml file in a newly created directory named after the database
# file. Once all records have been written the database file is removed.
#
# Dies (after printing a message or the usage text to STDERR) on any invalid
# argument, on a missing file, and when no records can be found.
sub main
{
    # parse the options
    my $hashParsingResult = {};
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Options are keyed by argument name in the parse result; read them
    # directly instead of assigning to same-named variables via string eval.
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity, $xml)
        = @{$hashParsingResult}{qw(language input_encoding metadata_set plugin
                                   document_field document_prefix document_suffix
                                   records_per_folder plugin_options collectdir site collection
                                   use_collection_plugin_options verbosity xml)};

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }
    # The plugin name is interpolated into a require and into evaluated code
    # below, so only accept a plain module name.
    if ($plugin !~ /^\w+$/) {
        print STDERR "Error: invalid plugin name '$plugin'\n";
        die "\n";
    }

    # check metadata set
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    }
    else {
        $metadata_set = "";
    }

    if (defined $collection && $collection =~ /\w/) {
        # Remember the requested name: use_collection's result replaces it and
        # is the empty string on failure, which made the message below useless.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    }
    else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }

    my $plugobj;
    require "$plugin.pm";

    # Work out the plugin options, as a string holding a comma separated list
    # of double-quoted values
    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file.
        my $gs_mode = "gs2";
        if ((defined $site) && ($site ne "")) { # GS3
            $gs_mode = "gs3";
        }
        my $configfilename = &colcfg::get_collect_cfg_name(STDERR, $gs_mode);
        my $collectcfg = &colcfg::read_collect_cfg($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        my @options = split(/\s/, $plugin_options);
        foreach my $option (@options) {
            # quote every option the user has not already quoted
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options_string = join(",", @options);
    }

    # Create the plugin object...
    if ($plugin_options_string eq "") {
        $plugobj = eval { $plugin->new() };
        die "$@" if $@;
    }
    else {
        # the option list only exists as Perl source text, so it has to be evaluated
        eval("\$plugobj = $plugin->new([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding($filename);
    }

    # Work out the directory to store the document files in: the filename
    # minus its extension. The extension may not contain a directory
    # separator, so a dot in a parent directory's name is never mistaken for it.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.\/\\]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq "") {
        print STDERR "Error: cannot derive a directory name from $filename (it has no file extension)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp
        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Never delete the database when nothing could be extracted from it
        if ($total_num_records == 0) {
            print STDERR "No records found in $filename, leaving it untouched\n";
            die "\n";
        }
        # -records_per_folder 0 is treated as "no limit": one folder for all
        # records (the folder arithmetic cannot cope with a zero divisor)
        $records_per_folder = $total_num_records if (!$records_per_folder);

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);

            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar(@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";

        # Never delete the database when nothing could be extracted from it
        if ($total_num_records == 0) {
            print STDERR "No records found in $filename, leaving it untouched\n";
            die "\n";
        }
        # -records_per_folder 0 is treated as "no limit": one folder for all
        # records (the folder arithmetic cannot cope with a zero divisor)
        $records_per_folder = $total_num_records if (!$records_per_folder);

        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata($doc_obj, $doc_obj->get_top_section(),
                                     &extrametautil::getmetadata($extrametadata, $record));

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &FileUtils::removeFiles($filename);
    $plugobj->clean_up_after_exploding();
}
357
358
# Create a new documents directory and start its metadata.xml file.
#
# Parameters:
#   $exploded_base_dir - path of the directory to create
# Returns: the path of the directory that was created.
# Side effects: leaves the global METADATA_XML_FILE handle open on the new
#   metadata.xml file, with the XML header and the opening <DirectoryMetadata>
#   tag already written.
# Dies if the directory or its metadata.xml already exists, or if either of
#   them cannot be created.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &FileUtils::makeDirectory($documents_directory);
    if (!-d $documents_directory) {
        die "Error: could not create document directory $documents_directory (bailing).\n";
    }

    my $documents_metadata_xml_file = &FileUtils::filenameConcatenate($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file. The three argument form of open takes the
    # path literally, whatever characters it contains. The handle has to stay
    # a global because the other subroutines write to it by name.
    open(METADATA_XML_FILE, '>', $documents_metadata_xml_file)
        or die "Error: could not open $documents_metadata_xml_file for writing: $!\n";
    binmode METADATA_XML_FILE, ":utf8";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
385
# Start a new documents directory when $record_number is the first record of
# a folder.
#
# Parameters:
#   $exploded_base_dir  - base path for the documents directories
#   $record_number      - number of the current record, counting from 1
#   $records_per_folder - how many records go into one directory
#   $total_num_records  - number of records in the whole database
#   $documents_dir_ref  - reference to the scalar holding the current
#                         directory; updated when a new directory is started
sub check_need_new_directory
{
    my ($exploded_base_dir, $record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    my $starts_new_folder =
        ($records_per_folder == 1 || ($record_number % $records_per_folder) == 1);

    if ($starts_new_folder) {
        # The directories are only numbered when more than one will be needed
        my $number_suffix = "";
        if ($total_num_records > $records_per_folder) {
            $number_suffix = "." . sprintf("%8.8d", $record_number);
        }

        $$documents_dir_ref = need_new_directory($exploded_base_dir . $number_suffix);
    }
}
403
404
405
406
407
# Write the metadata.xml entry for one record, attached either to the
# document the record refers to or to a dummy .nul file.
#
# Parameters:
#   $document_field      - name of the metadata field that names the record's
#                          document (may be undefined)
#   $doc_obj             - doc object holding the record's metadata
#   $record_number       - number of the record, used to name the .nul file
#   $documents_directory - directory the document / .nul file is placed in
#   $orig_base_dir       - directory of the metadata database file
#   $document_prefix     - text put in front of the field value to form the
#                          document location (optional)
#   $document_suffix     - text put after the field value (optional)
#   $metadata_set        - metadata set prefix passed on to the entry writer
#   $verbosity           - verbosity level passed on to obtain_document
# Dies if the dummy .nul file cannot be created.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both are optional; treat a missing one as empty rather than using an
    # undefined value in the file test and the concatenations below
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g; # don't regex brackets () here though!

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            my $document_file_full;
            if (-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@) {
                # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                $document_prefix =~ s/(\/|\\)$//;
                $document_file_full = &FileUtils::filenameConcatenate($document_prefix, "$value$document_suffix");
            }
            else { # the doc prefix may also contain the prefix of the actual *filename* following the directory
                $document_file_full = $document_prefix . $value . $document_suffix;
            }

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        # Without the .nul file the metadata entry would point at nothing, so
        # failing to create it is fatal
        my $nul_path = "$documents_directory/$document_file";
        open(my $nul_fh, '>', $nul_path)
            or die "Error: could not create nul document $nul_path: $!\n";
        close($nul_fh);

        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }
}
458
# Finish and close the metadata.xml file that is open on the global
# METADATA_XML_FILE handle.
#
# Dies if the closing tag cannot be written or the file cannot be closed.
# Buffered write errors only show up at this point, and the caller deletes the
# source database afterwards, so a failure here must not go unnoticed.
sub close_directory
{
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"
        or die "Error: could not finish writing the metadata.xml file: $!\n";
    close(METADATA_XML_FILE)
        or die "Error: could not close the metadata.xml file: $!\n";
}
466
467
# Close the current metadata.xml file when $record_number is the last record
# of its folder, or the last record of all.
#
# Parameters:
#   $record_number      - number of the record just written, counting from 1
#   $records_per_folder - how many records go into one directory
#   $total_num_records  - number of records in the whole database
sub check_close_directory
{
    my ($record_number, $records_per_folder, $total_num_records) = @_;

    my $folder_is_full = (($record_number % $records_per_folder) == 0);
    my $is_last_record = ($record_number == $total_num_records);

    if ($folder_is_full || $is_last_record) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
477
478
479
# Write one <FileSet> entry to a metadata.xml file.
#
# Parameters (positional):
#   $metadata_xml_file - filehandle to print to (a handle, or the name of a
#                        global handle as passed by the callers in this file)
#   $file_name         - name of the file the metadata belongs to; written as
#                        a regular expression that matches exactly that name
#   $record_metadata   - reference to an array of [field, value] pairs
#   $meta_prefix       - metadata set prefix (e.g. "dc."); when not empty it
#                        replaces any existing namespace of a field
#
# The fields lastmodified, gsdlsourcefilename, gsdldoctype, FileFormat and
# any field ending in "^all" are not written.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    $meta_prefix = "" unless defined $meta_prefix;

    # Convert $file_name into a regular expression that matches it. Every
    # regex metacharacter is escaped: names such as "a+b.pdf" or "doc?id=3"
    # would otherwise produce an expression that does not match the file.
    $file_name =~ s/([\\.^\$*+?|(){}\[\]])/\\$1/g;

    # Make $file_name XML-safe (none of these characters is touched above)
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    print {$metadata_xml_file}
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        # We're only interested in metadata from the database
        next if ($field eq "lastmodified");
        next if ($field eq "gsdlsourcefilename");
        next if ($field eq "gsdldoctype");
        next if ($field eq "FileFormat");

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
        # (this is done before the XML escaping, so the entity's own & gets escaped as well)
        $value =~ s/\[/&\#091;/g;
        $value =~ s/\]/&\#093;/g;

        # Make $value XML-safe
        $value =~ s/&/&amp;/g; # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # do we already have a namespace specified?
        my $full_field = $field;
        if ($meta_prefix ne "") {
            $full_field =~ s/^\w+\.//;
            $full_field = $meta_prefix . $full_field;
        }

        # The name ends up inside a double-quoted attribute, so it must be
        # XML-safe too
        $full_field =~ s/&/&amp;/g;
        $full_field =~ s/</&lt;/g;
        $full_field =~ s/>/&gt;/g;
        $full_field =~ s/"/&quot;/g;

        print {$metadata_xml_file} " <Metadata mode=\"accumulate\" name=\"$full_field\">$value</Metadata>\n";
    }

    print {$metadata_xml_file}
        " </Description>\n" .
        " </FileSet>\n";
}
550
# Fetch the document belonging to a record into the documents directory.
#
# A location starting with http:, https: or ftp: is downloaded with wget.
# Anything else is taken to be a file on disk (relative paths are relative to
# $orig_base_dir); it is copied, and the original is removed when it lies
# under $orig_base_dir. If the document cannot be obtained, an empty
# "<name>.nul" file is created in its place.
#
# Parameters:
#   $document_file_full  - URL or path of the document
#   $documents_directory - directory to place the document in
#   $orig_base_dir       - directory of the metadata database file
#   $verbosity           - verbosity level
# Returns: the name (without directory) of the file now representing the
#   document in $documents_directory, ending in ".nul" if it is a dummy.
# Dies if the .nul file cannot be created.
sub obtain_document
{
    my ($document_file_full, $documents_directory, $orig_base_dir, $verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        if ($document_file_full =~ /([^\/]+)$/) {
            $document_file_name = $1;
        }
        else {
            # The URL ends in a slash, so it has no name part of its own;
            # use the name wget itself defaults to
            $document_file_name = "index.html";
        }
        $local_document_file = &FileUtils::filenameConcatenate($documents_directory, $document_file_name);

        # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
        &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

        my @wget_options = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_options, "--timestamping"); # Only re-download files if they're newer

        # Run wget without a shell: the URL comes from the metadata being
        # exploded and must not be interpreted as shell syntax
        my $wget_status = system("wget", @wget_options, $document_file_full,
                                 "--output-document", $local_document_file);

        # wget creates the output document before it downloads anything, so a
        # failed download can leave an empty file behind. Remove it so that
        # the checks below see the document as not obtained.
        if ($wget_status != 0 && -z $local_document_file) {
            unlink($local_document_file);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &FileUtils::filenameConcatenate($document_file_full);
        my $dir_sep = &util::get_os_dirsep();

        # The file name is whatever follows the last directory separator
        if ($document_file_full =~ m/(.+$dir_sep)?(.*)$/) {
            $document_file_name = $2;
        }

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &FileUtils::filenameConcatenate($orig_base_dir, $document_file_full);
        }

        $local_document_file = &FileUtils::filenameConcatenate($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &FileUtils::copyFiles($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &FileUtils::removeFiles($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        open(my $nul_fh, '>', "$local_document_file.nul")
            or die "Error: could not create nul document $local_document_file.nul: $!\n";
        close($nul_fh);
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
628
# Look up the options a collection configuration gives for a plugin.
#
# Parameters:
#   $collectcfg - collection configuration; its 'plugin' entry is a reference
#                 to an array of [plugin name, option, option, ...] arrays
#   $plugin     - name of the plugin to look for
# Returns: the options of the first entry for $plugin as a comma separated
#   list of double-quoted Perl string literals (the caller evaluates it as
#   Perl source), or "" if the plugin is not listed or has no options.
#
# The configuration is not modified. Backslashes, double quotes, '$' and '@'
# in an option are escaped so that evaluating the literal gives back exactly
# the configured value (e.g. a regular expression like \.txt$).
sub get_plugin_options {
    my ($collectcfg, $plugin) = @_;

    my $plugin_list = $collectcfg->{'plugin'} || [];

    foreach my $pluginoptions (@$plugin_list) {
        # copy the entry instead of shifting the name off the caller's data
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless defined $pluginname && $pluginname eq $plugin;

        my @quoted_options;
        foreach my $option_value (@option_values) {
            (my $escaped_value = $option_value) =~ s/([\\"\$\@])/\\$1/g;
            push(@quoted_options, "\"$escaped_value\"");
        }
        return join(",", @quoted_options);
    }
    return "";
}
643
# Run the script on the command line arguments
main(@ARGV);
645
Note: See TracBrowser for help on using the repository browser.