source: main/trunk/greenstone2/bin/script/explode_metadata_database.pl@ 24829

Last change on this file since 24829 was 24829, checked in by ak19, 12 years ago

Changes to bat files and perl code to deal with brackets in (Windows) filepath. Also checked winmake.bat files to see if changes were needed there. These changes go together with the commits 24826 to 24828 for gems.bat, and commit 24820 on makegs2.bat.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.4 KB
Line 
1#!/usr/bin/perl -w
2
3
BEGIN {
    # The Greenstone perl libraries live under $GSDLHOME, so the script
    # cannot do anything useful without it.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";

    # Put the plugin directory first on the search path, followed by the
    # general perllib directory.
    unshift(@INC,
            "$ENV{'GSDLHOME'}/perllib/plugins",
            "$ENV{'GSDLHOME'}/perllib");
}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17use colcfg;
18
19use FileHandle;
20
21use File::Spec;
22use File::Basename;
23
# Choices offered for the -input_encoding option. The fixed entries come
# first; one entry per encoding in $encodings::encodings is appended below.
my $unicode_list = [
    { name => "auto",    desc => "{ReadTextFile.input_encoding.auto}" },
    { name => "ascii",   desc => "{BasePlugin.encoding.ascii}" },
    { name => "utf8",    desc => "{BasePlugin.encoding.utf8}" },
    { name => "unicode", desc => "{BasePlugin.encoding.unicode}" },
];

# Append the known encodings, ordered by their 'name' value; the encoding key
# becomes the option value and the 'name' value becomes its description.
my $e = $encodings::encodings;
push(@{$unicode_list},
     map  { +{ name => $_, desc => $e->{$_}->{'name'} } }
     sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} }
     keys(%$e));
43
# Command-line options accepted by this script. The list is passed to
# parse2::parse and, via $options, to the PrintUsage routines.
my $arguments = [
    { name      => "language",
      desc      => "{scripts.language}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "plugin",
      desc      => "{explode.plugin}",
      type      => "string",
      reqd      => "yes",
      hiddengli => "yes" },

    { name => "input_encoding",
      desc => "{explode.encoding}",
      type => "enum",
      deft => "auto",
      list => $unicode_list,
      reqd => "no" },

    { name => "metadata_set",
      desc => "{explode.metadata_set}",
      type => "string",
      reqd => "no" },

    { name => "document_field",
      desc => "{explode.document_field}",
      type => "string",
      reqd => "no" },

    { name => "document_prefix",
      desc => "{explode.document_prefix}",
      type => "string",
      reqd => "no" },

    { name => "document_suffix",
      desc => "{explode.document_suffix}",
      type => "string",
      reqd => "no" },

    { name  => "records_per_folder",
      desc  => "{explode.records_per_folder}",
      type  => "int",
      range => "0,",
      deft  => "100",
      reqd  => "no" },

    { name      => "collectdir",
      desc      => "{import.collectdir}",
      type      => "string",
      # parsearg left "" as default
      #deft     => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      deft      => "",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "site",
      desc      => "{import.site}",
      type      => "string",
      deft      => "",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "collection",
      desc      => "{explode.collection}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "use_collection_plugin_options",
      desc      => "{explode.use_collection_plugin_options}",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "plugin_options",
      desc      => "{explode.plugin_options}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name    => "verbosity",
      desc    => "{import.verbosity}",
      type    => "int",
      range   => "0,",
      deft    => "1",
      reqd    => "no",
      modegli => "3" },

    { name      => "xml",
      desc      => "",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },
];

# Script description used when printing the usage text.
my $options = {
    name => "explode_metadata_database.pl",
    desc => "{explode.desc}",
    args => $arguments,
};
130
131
132
# Entry point of the script.
#
# Parses the command line (using the file-level $arguments / $options),
# loads the requested plugin, splits the metadata database named on the
# command line into records and, for every record, writes a metadata.xml
# entry attached either to the document the record points at or to a dummy
# .nul file. When all records have been written the original database file
# is deleted.
#
# Dies (after printing a message or the usage text) on any invalid option.
sub main
{
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical of the same name. A lookup
    # table is used instead of a string eval so that only the options declared
    # above can be assigned and no option name is ever executed as code.
    my %variable_for_option = (
        'language'                      => \$language,
        'input_encoding'                => \$input_encoding,
        'metadata_set'                  => \$metadata_set,
        'plugin'                        => \$plugin,
        'document_field'                => \$document_field,
        'document_prefix'               => \$document_prefix,
        'document_suffix'               => \$document_suffix,
        'records_per_folder'            => \$records_per_folder,
        'plugin_options'                => \$plugin_options,
        'collectdir'                    => \$collectdir,
        'site'                          => \$site,
        'collection'                    => \$collection,
        'use_collection_plugin_options' => \$use_collection_plugin_options,
        'verbosity'                     => \$verbosity,
        'xml'                           => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $variable_ref = $variable_for_option{$strVariable};
        next unless defined $variable_ref;
        $$variable_ref = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # The folder bookkeeping below computes "$record_number % $records_per_folder",
    # so a value of 0 would abort part-way with "Illegal modulus zero".
    # Reject it up front with a meaningful message instead.
    if (!defined $records_per_folder || $records_per_folder < 1) {
        print STDERR "Error: -records_per_folder must be at least 1\n";
        die "\n";
    }

    # check metadata set
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }
    if (defined $collection && $collection =~ /\w/) {
        # Remember the requested name: use_collection returns "" on failure,
        # and the error message must still be able to name the collection.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    } else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }
    my $plugobj;
    require "$plugin.pm";

    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file and take the plugin's
        # options from there.
        my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name(STDERR);
        my $collectcfg = &colcfg::read_collect_cfg ($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        # Quote every option that is not already quoted, then build the
        # comma-separated list that is spliced into the constructor call below.
        my @options = split(/\s/, $plugin_options);
        foreach my $option (@options) {
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options_string = join (",", @options);
    }

    # NOTE(review): the plugin name and the option string are interpolated
    # into a string eval, so they are executed as Perl code. They come from
    # the command line / collect.cfg here; do not feed untrusted values in.
    if ($plugin_options_string eq "") {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    } else {
        eval ("\$plugobj = new $plugin([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # The documents go into a directory named after the database file minus
    # its extension. The extension may not contain a directory separator, so
    # a dot in a parent directory name is never mistaken for it.
    my ($exploded_base_dir) = ($filename =~ /^(.*)\.[^.\/\\]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq ""
        || $exploded_base_dir =~ /[\/\\]$/) {
        print STDERR "Error: cannot work out a directory name from $filename (the file needs a name and an extension)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp
        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);
            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar (@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";
        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            &check_need_new_directory($exploded_base_dir, $record_number, $records_per_folder, $total_num_records, \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $extrametadata->{$record});

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number, $documents_directory, $orig_base_dir, $document_prefix, $document_suffix, $metadata_set, $verbosity);

            &check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();
}
350
351
# Creates the directory that will hold a batch of exploded records and starts
# its metadata.xml file.
#
# $exploded_base_dir - path of the directory to create; it must not exist yet.
#
# Leaves the global METADATA_XML_FILE handle open on the new metadata.xml with
# the XML prologue and the opening <DirectoryMetadata> tag already written.
# Returns the directory path. Dies if the directory or the metadata.xml file
# already exists, or if the metadata.xml file cannot be created.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file. The handle stays a package-level bareword
    # because the other routines in this script write to it by name. A failed
    # open must stop the run: carrying on would lose every record's metadata
    # and the original database is deleted at the end.
    open(METADATA_XML_FILE, '>', $documents_metadata_xml_file)
        or die "Error: could not create documents metadata.xml file $documents_metadata_xml_file: $!\n";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
377
# Starts a new documents directory when the given record is the first one of
# a folder.
#
# $exploded_base_dir  - base path for the documents directories
# $record_number      - 1-based number of the record about to be written
# $records_per_folder - how many records go into one directory
# $total_num_records  - number of records in the whole database
# $documents_dir_ref  - scalar ref that receives the new directory path
#
# When everything fits into one folder the base path is used as is; otherwise
# the zero-padded record number is appended to it.
sub check_need_new_directory
{
    my ($exploded_base_dir, $record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    my $starts_new_folder = ($records_per_folder == 1)
        || (($record_number % $records_per_folder) == 1);

    if ($starts_new_folder) {
        my $needs_numbered_name = ($total_num_records > $records_per_folder);
        my $target_directory = $needs_numbered_name
            ? $exploded_base_dir . "." . sprintf("%8.8d", $record_number)
            : $exploded_base_dir;

        $$documents_dir_ref = need_new_directory($target_directory);
    }
}
395
396
397
398
399
# Writes the metadata.xml entry for one record, attached either to the
# document the record refers to or, failing that, to a dummy .nul file.
#
# $document_field      - name of the metadata field holding the document
#                        reference (may be undefined: no lookup is done then)
# $doc_obj             - object carrying the record's metadata; its
#                        get_all_metadata / get_top_section methods are used
# $record_number       - number used to name the dummy file (may be undefined)
# $documents_directory - directory the document / dummy file is placed in
# $orig_base_dir       - directory of the original database file
# $document_prefix     - text put in front of the field value (may be undefined)
# $document_suffix     - text appended to the field value (may be undefined)
# $metadata_set        - prefix handed on to write_metadata_xml_file_entry
# $verbosity           - handed on to obtain_document
#
# Entries are written to the global METADATA_XML_FILE handle.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both options are optional on the command line; treat a missing one as
    # empty rather than testing and concatenating undefined values.
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g; # don't regex brackets () here though!

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            my $document_file_full;
            if (-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@ ) {
                # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                $document_prefix =~ s/(\/|\\)$//;
                $document_file_full = &util::filename_cat($document_prefix, "$value$document_suffix");
            } else { # the doc prefix may also contain the prefix of the actual *filename* following the directory
                $document_file_full = $document_prefix . $value . $document_suffix;
            }

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        my $dummy_path = "$documents_directory/$document_file";
        if (open(my $dummy_fh, '>', $dummy_path)) {
            close($dummy_fh);
        }
        else {
            # Keep going (the metadata entry is still worth writing) but make
            # the failure visible instead of ignoring it.
            print STDERR "WARNING: Could not create nul document $dummy_path: $!\n";
        }
        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }
}
450
# Finishes the metadata.xml file of the current documents directory: writes
# the closing </DirectoryMetadata> tag to the global METADATA_XML_FILE handle
# and closes it.
#
# Dies if the tag cannot be written or the file cannot be closed. Buffered
# write errors only show up at close, and the caller deletes the original
# database once all directories are done, so a failure here must not pass
# unnoticed.
sub close_directory
{
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"
        or die "Error: could not finish the metadata.xml file: $!\n";
    close(METADATA_XML_FILE)
        or die "Error: could not close the metadata.xml file: $!\n";
}
458
459
# Closes the current documents directory (via close_directory) once the given
# record has filled its folder or is the last record of the database.
#
# $record_number      - 1-based number of the record just written
# $records_per_folder - how many records go into one directory
# $total_num_records  - number of records in the whole database
sub check_close_directory
{
    my ($record_number, $records_per_folder, $total_num_records) = @_;

    my $folder_is_full = (($record_number % $records_per_folder) == 0);
    my $is_last_record = ($record_number == $total_num_records);

    if ($folder_is_full || $is_last_record) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
469
470
471
# Writes one <FileSet> element for a document to a metadata.xml file.
#
# $metadata_xml_file - filehandle (or handle name) to print to
# $file_name         - plain name of the document the metadata belongs to
# $record_metadata   - array ref of [field, value] pairs
# $meta_prefix       - "" to keep the field names, otherwise the prefix that
#                      replaces any leading "word." of each field name
#
# The file name is written XML-escaped and turned into a regular expression
# that matches it literally. The bookkeeping fields lastmodified,
# gsdlsourcefilename, gsdldoctype and FileFormat and any field ending in
# "^all" are left out.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;
    $meta_prefix = "" unless defined $meta_prefix;

    # Make $file_name XML-safe
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it. Every
    # metacharacter has to be escaped, not only dots and brackets: a name
    # such as "a+b.pdf" would otherwise yield a pattern that does not match
    # the file it was made from.
    $file_name =~ s/([\\.(){}\[\]+*?^\$|])/\\$1/g;

    print {$metadata_xml_file}
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    # We're only interested in metadata from the database
    my %is_bookkeeping_field = map { $_ => 1 }
        ("lastmodified", "gsdlsourcefilename", "gsdldoctype", "FileFormat");

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        next if $is_bookkeeping_field{$field};

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI.
        # This is done before the XML escaping on purpose: the '&' of the
        # bracket entity is escaped again below, so the bracket entity reaches
        # the file in escaped form (&amp;#091;).
        $value =~ s/\[/&\#091;/g;
        $value =~ s/\]/&\#093;/g;

        # Make $value XML-safe
        $value =~ s/&/&amp;/g; # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # do we already have a namespace specified?
        my $full_field = $field;
        if ($meta_prefix ne "") {
            $full_field =~ s/^\w+\.//;
            $full_field = $meta_prefix.$full_field;
        }

        print {$metadata_xml_file} " <Metadata mode=\"accumulate\" name=\"$full_field\">$value</Metadata>\n";
    }

    print {$metadata_xml_file}
        " </Description>\n" .
        " </FileSet>\n";
}
542
# Fetches the document a record refers to into the documents directory.
#
# $document_file_full  - http/https/ftp URL, or a path on disk (a relative
#                        path is taken relative to $orig_base_dir)
# $documents_directory - directory the document is placed in
# $orig_base_dir       - directory of the original database file
# $verbosity           - above 1 the document is announced on STDERR; above 2
#                        wget is run with --verbose
#
# URLs are downloaded with wget; files on disk are copied, and the original is
# removed afterwards when its path starts with $orig_base_dir. If the document
# cannot be obtained an empty "<name>.nul" file is created instead.
#
# Returns the name (without directory) of the file now representing the
# document in $documents_directory.
sub obtain_document
{
    my ($document_file_full, $documents_directory, $orig_base_dir, $verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        # The local name is the last non-empty path segment. Taking it from
        # the match result (and allowing trailing slashes) avoids reading a
        # stale $1 when the URL ends in "/".
        ($document_file_name) = ($document_file_full =~ /([^\/]+)\/*$/);
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_arguments = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_arguments, "--timestamping"); # Only re-download files if they're newer

        # The URL comes from the metadata records, so it must never be pasted
        # into a shell command line: the list form of system() hands every
        # argument to wget untouched.
        my $wget_status = system("wget", @wget_arguments, $document_file_full,
                                 "--output-document", $local_document_file);
        if ($wget_status == -1) {
            print STDERR "WARNING: Could not run wget: $!\n";
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &util::filename_cat($document_file_full);
        my $dir_sep = &util::get_os_dirsep();

        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir, $document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &util::cp($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        my $nul_path = "$local_document_file.nul";
        if (open(my $nul_fh, '>', $nul_path)) {
            close($nul_fh);
        }
        else {
            print STDERR "WARNING: Could not create nul document $nul_path: $!\n";
        }
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
617
# Looks up the options configured for a plugin in a collection configuration.
#
# $collectcfg - hash ref whose 'plugin' entry is an array ref of array refs,
#               each holding a plugin name followed by its options
# $plugin     - name of the plugin to look for
#
# Returns the options of the first entry for $plugin, each wrapped in double
# quotes and joined with commas, or "" when the plugin is not listed or has no
# options. The configuration passed in is left untouched, so it can be
# queried again.
sub get_plugin_options {
    my ($collectcfg, $plugin) = @_;

    my $plugin_list = $collectcfg->{'plugin'};
    return "" unless defined $plugin_list;

    foreach my $pluginoptions (@$plugin_list) {
        # Work on a copy: shifting the name off the caller's array (or
        # quoting its elements in place) would corrupt the configuration.
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless defined $pluginname && $pluginname eq $plugin;

        return join(",", map { "\"$_\"" } @option_values);
    }
    return "";
}
632
# Run the script with the command-line arguments.
main(@ARGV);
634
Note: See TracBrowser for help on using the repository browser.