source: main/trunk/greenstone2/bin/script/explode_metadata_database.pl@ 24952

Last change on this file since 24952 was 24952, checked in by ak19, 12 years ago

All Perl code that accesses the extrametakeys, extrametadata and extrametafile data structures has been moved into a new Perl module called extrametautil.pm. The next step will be to ensure that the file_regexes used to index into these data structures are consistent (for example, by always using the same style of slashes, such as URL-style slashes).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.4 KB
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
8}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use extrametautil;
16use printusage;
17use parse2;
18use colcfg;
19
20use FileHandle;
21
22use File::Spec;
23use File::Basename;
24
# Values accepted by the -input_encoding option.  The list starts with the
# four generic choices and is then extended with one entry for every encoding
# listed in $encodings::encodings.
my $unicode_list = [
    { name => 'auto',    desc => '{ReadTextFile.input_encoding.auto}' },
    { name => 'ascii',   desc => '{BasePlugin.encoding.ascii}' },
    { name => 'utf8',    desc => '{BasePlugin.encoding.utf8}' },
    { name => 'unicode', desc => '{BasePlugin.encoding.unicode}' },
];

# Add the encodings from the encodings module, ordered by their display name.
# Each entry uses the encoding's key as its name and its display name as the
# description.
my $e = $encodings::encodings;
my @encodings_by_display_name =
    sort { $e->{$a}{name} cmp $e->{$b}{name} } keys %$e;
push @$unicode_list,
    map { +{ name => $_, desc => $e->{$_}{name} } } @encodings_by_display_name;
44
# Description of every command line option understood by this script, in the
# structure handed to parse2::parse() and to the PrintUsage functions.
my $arguments = [
    {   name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'plugin',
        desc      => '{explode.plugin}',
        type      => 'string',
        reqd      => 'yes',
        hiddengli => 'yes',
    },
    {   name => 'input_encoding',
        desc => '{explode.encoding}',
        type => 'enum',
        deft => 'auto',
        list => $unicode_list,
        reqd => 'no',
    },
    {   name => 'metadata_set',
        desc => '{explode.metadata_set}',
        type => 'string',
        reqd => 'no',
    },
    {   name => 'document_field',
        desc => '{explode.document_field}',
        type => 'string',
        reqd => 'no',
    },
    {   name => 'document_prefix',
        desc => '{explode.document_prefix}',
        type => 'string',
        reqd => 'no',
    },
    {   name => 'document_suffix',
        desc => '{explode.document_suffix}',
        type => 'string',
        reqd => 'no',
    },
    {   name  => 'records_per_folder',
        desc  => '{explode.records_per_folder}',
        type  => 'int',
        range => '0,',
        deft  => '100',
        reqd  => 'no',
    },
    {   name      => 'collectdir',
        desc      => '{import.collectdir}',
        type      => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'site',
        desc      => '{import.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'collection',
        desc      => '{explode.collection}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'use_collection_plugin_options',
        desc      => '{explode.use_collection_plugin_options}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'plugin_options',
        desc      => '{explode.plugin_options}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'verbosity',
        desc    => '{import.verbosity}',
        type    => 'int',
        range   => '0,',
        deft    => '1',
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'xml',
        desc      => '',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];

# Top-level description of the script itself (name, description key, options).
my $options = {
    name => 'explode_metadata_database.pl',
    desc => '{explode.desc}',
    args => $arguments,
};
131
132
133
# Entry point of the script.
#
# Parses the command line (the options are described by $arguments), loads and
# initialises the requested plugin and then "explodes" the metadata database
# named by the single remaining argument: each record becomes a <FileSet>
# entry in a metadata.xml file inside a newly created folder, attached either
# to the document named by the -document_field metadata or to a dummy .nul
# file.  When every record has been written the original database file is
# deleted.
#
# Dies (after printing a message or the usage text) when the options are
# invalid, when the input file is missing or has no extension to strip, and
# when no records could be extracted from it.
sub main
{
    # These names must match the option names in $arguments: the string eval
    # below assigns each parsed option to the lexical of the same name.
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }


    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-empty set name becomes the "<set>." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }
    if (defined $collection && $collection =~ /\w/) {
        # Keep the name the user asked for: use_collection() returns "" on
        # failure, and the error message must still be able to name it.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    } else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }
    my $plugobj;
    require "$plugin.pm";

    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file and take the options
        # that it lists for this plugin.
        my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name(STDERR);
        my $collectcfg = &colcfg::read_collect_cfg ($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        # split(' ', ...) skips leading whitespace and treats runs of
        # whitespace as one separator, so no empty options are produced.
        my @options = split(' ', $plugin_options);
        foreach my $option (@options) {
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options_string= join (",", @options);
    }

    # The option list is a string of quoted values, so the constructor call
    # has to be built and run with a string eval.
    if ($plugin_options_string eq "") {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    } else {
        eval ("\$plugobj = new $plugin([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # The directory that stores the document files is named after the
    # database file with its extension removed.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq "") {
        print STDERR "Error: cannot derive a directory name from $filename (no file extension to remove)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);


    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp

        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Never delete the source file when nothing was exploded from it
        if ($total_num_records == 0) {
            print STDERR "Error: no records found in $filename (file left untouched)\n";
            die "\n";
        }

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir,$record_number,
                                     $records_per_folder,$total_num_records,
                                     \$documents_directory);
            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);


            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix, $metadata_set, $verbosity);


            check_close_directory($record_number,$records_per_folder,$total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar (@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";

        # Never delete the source file when nothing was exploded from it
        if ($total_num_records == 0) {
            print STDERR "Error: no records found in $filename (file left untouched)\n";
            die "\n";
        }

        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            &check_need_new_directory($exploded_base_dir, $record_number, $records_per_folder, $total_num_records, \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), &extrametautil::getmetadata($extrametadata, $record));

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number, $documents_directory, $orig_base_dir, $document_prefix, $document_suffix, $metadata_set, $verbosity);

            &check_close_directory($record_number,$records_per_folder,$total_num_records);

            $record_number = $record_number + 1;

        }
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();

}
351
352
# Create a new directory for exploded records and start its metadata.xml file.
#
# Parameter: the path of the directory to create.
# Returns:   that same path.
#
# The metadata.xml file is opened on the package-wide METADATA_XML_FILE handle
# and left open after the XML prologue and the opening <DirectoryMetadata> tag
# have been written; close_directory() writes the closing tag.
#
# Dies if the directory or its metadata.xml file already exists, or if the
# metadata.xml file cannot be created.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file.  Three-argument open keeps odd characters
    # in the path from being read as an open mode, and a failure must stop the
    # run: every record written afterwards would otherwise be lost silently.
    open(METADATA_XML_FILE, '>', $documents_metadata_xml_file)
        or die "Error: unable to create $documents_metadata_xml_file: $!\n";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
378
# Start a new output directory if the given record is the first of a folder.
#
# Parameters:
#   $exploded_base_dir  - base path for the output directories
#   $record_number      - 1-based number of the record about to be written
#   $records_per_folder - maximum number of records per directory; 0 (which
#                         the -records_per_folder option allows) or an
#                         undefined value is treated as "no limit", so all
#                         records go into a single directory
#   $total_num_records  - total number of records being exploded
#   $documents_dir_ref  - reference to the scalar that receives the path of
#                         the directory created by need_new_directory()
#
# When the records do not all fit into one directory, the directory name gets
# the zero-padded number of its first record appended (".00000001", ...).
sub check_need_new_directory
{
    my ($exploded_base_dir,$record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    # A limit of 0 must not reach the modulus below ("Illegal modulus zero")
    my $unlimited = !$records_per_folder;

    my $starts_new_folder;
    if ($unlimited) {
        $starts_new_folder = ($record_number == 1);
    }
    else {
        # x % 1 is never 1, hence the explicit test for one record per folder
        $starts_new_folder = ($records_per_folder == 1
                              || ($record_number % $records_per_folder) == 1);
    }

    # Check if we need to start a new directory for these records
    if ($starts_new_folder) {
        my $documents_directory = $exploded_base_dir;

        if (!$unlimited && $total_num_records > $records_per_folder) {
            $documents_directory .= "." . sprintf("%8.8d", $record_number);
        }

        $$documents_dir_ref = need_new_directory($documents_directory);
    }
}
396
397
398
399
400
# Write the metadata.xml entry (or entries) for one record.
#
# Parameters:
#   $document_field      - name of the metadata field whose value names the
#                          record's document (may be undefined)
#   $doc_obj             - doc object holding the record's metadata
#   $record_number       - number of the record, used to name its .nul file
#   $documents_directory - directory the record's files are placed in
#   $orig_base_dir       - directory containing the original database file
#   $document_prefix     - text put in front of the field value (optional)
#   $document_suffix     - text appended to the field value (optional)
#   $metadata_set        - metadata set prefix passed on to
#                          write_metadata_xml_file_entry()
#   $verbosity           - verbosity level passed on to obtain_document()
#
# For every metadata value of the record whose field equals $document_field,
# the document is fetched with obtain_document() and an entry is written for
# it.  If no document was obtained that way, an empty "<record number>.nul"
# file ("doc.nul" without a record number) is created and the entry is
# attached to that instead.  Entries go to the METADATA_XML_FILE handle.
#
# Dies if the .nul file cannot be created.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both are optional on the command line; treat "not given" as empty so
    # the -d test and the concatenation below never see an undefined value.
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g; # don't regex brackets () here though!
            my $document_file_full;

            # Does this metadata element specify a document to obtain?
            if ($field eq $document_field) {
                if(-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@ ) {
                    # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                    # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                    $document_prefix =~ s/(\/|\\)$//;
                    $document_file_full = &util::filename_cat($document_prefix, "$value$document_suffix");
                } else { # the doc prefix may also contain the prefix of the actual *filename* following the directory
                    $document_file_full = $document_prefix . $value . $document_suffix;
                }

                # this either downloads/copies the document, or creates a nul file for it.
                $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
                &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
            }
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        # The entry written below refers to this file, so failing to create
        # it must not go unnoticed.
        my $dummy_file_path = "$documents_directory/$document_file";
        open(DUMMY_FILE, '>', $dummy_file_path)
            or die "Error: unable to create $dummy_file_path: $!\n";
        close(DUMMY_FILE);
        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
451
# Finish the metadata.xml file of the current directory: write the closing
# </DirectoryMetadata> tag to the METADATA_XML_FILE handle and close it.
#
# Returns 1.  Dies if the tag cannot be written or the file cannot be closed.
# Buffered write errors only surface at close, and the caller deletes the
# original database once all directories have been written.
sub close_directory
{
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"
        or die "Error: unable to finish the current metadata.xml file: $!\n";
    close(METADATA_XML_FILE)
        or die "Error: unable to close the current metadata.xml file: $!\n";

    return 1;
}
459
460
# Close the current directory's metadata.xml file if the given record was the
# last one that belongs in it.
#
# Parameters:
#   $record_number      - 1-based number of the record just written
#   $records_per_folder - maximum number of records per directory; 0 (which
#                         the -records_per_folder option allows) or an
#                         undefined value is treated as "no limit"
#   $total_num_records  - total number of records being exploded
#
# The file is closed when the folder is full or when the very last record has
# been written.
sub check_close_directory
{
    my ($record_number,$records_per_folder,$total_num_records) = @_;

    # A limit of 0 must not reach the modulus ("Illegal modulus zero")
    my $folder_is_full = $records_per_folder
        && ($record_number % $records_per_folder) == 0;

    if ($folder_is_full || $record_number == $total_num_records) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
470
471
472
# Write one <FileSet> entry to a metadata.xml file.
#
# Parameters (positional):
#   1. the filehandle of the metadata.xml file to print to
#   2. the name of the file the metadata belongs to
#   3. a reference to a list of [field, value] pairs
#   4. the metadata set prefix ("" for none); when non-empty it replaces the
#      leading "<word>." of each field name, or is simply prepended if the
#      field has no such prefix
#
# The file name is made XML-safe and then turned into a regular expression
# that matches it literally.  Housekeeping fields and fields ending in "^all"
# are left out.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    my %xml_entity_for     = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    my %bracket_entity_for = ('[' => '&#091;', ']' => '&#093;');

    # We're only interested in metadata from the database
    my %is_housekeeping_field =
        map { $_ => 1 } qw(lastmodified gsdlsourcefilename gsdldoctype FileFormat);

    # Make the file name XML-safe...
    $file_name =~ s/([&<>])/$xml_entity_for{$1}/g;
    # ...and backslash-escape dots and brackets so it is a regular expression
    # matching the name itself
    $file_name =~ s/([.(){}\[\]])/\\$1/g;

    print {$metadata_xml_file} join("",
        "\n",
        " <FileSet>\n",
        " <FileName>$file_name</FileName>\n",
        " <Description>\n");

    PAIR:
    foreach my $entry (@$record_metadata) {
        my ($field, $value) = @$entry;

        next PAIR if $is_housekeeping_field{$field};

        # Ignore the ^all metadata, since it will be invalid if the source
        # metadata is changed (ISISPlug specific!)
        next PAIR if $field =~ /\^all$/;

        # Drop "ex." only when it is the sole metadata set prefix
        # (ex.dc.* stays intact)
        $field =~ s/^ex\.([^.]+)$/$1/;

        # Square brackets in metadata values need to be escaped so they don't
        # confuse Greenstone/GLI.  This runs before the XML escaping, so the
        # '&' of the bracket entities is itself escaped afterwards.
        $value =~ s/([\[\]])/$bracket_entity_for{$1}/g;

        # Make the value XML-safe (may mess up existing entities!)
        $value =~ s/([&<>])/$xml_entity_for{$1}/g;

        # Swap any existing namespace for the requested metadata set
        my $metadata_name = $field;
        if ($meta_prefix ne "") {
            (my $unprefixed_field = $field) =~ s/^\w+\.//;
            $metadata_name = $meta_prefix . $unprefixed_field;
        }

        print {$metadata_xml_file} " <Metadata mode=\"accumulate\" name=\"$metadata_name\">$value</Metadata>\n";
    }

    print {$metadata_xml_file} join("",
        " </Description>\n",
        " </FileSet>\n");
}
543
# Fetch the document that belongs to a record into the documents directory.
#
# Parameters:
#   $document_file_full  - URL (http, https or ftp) or file path of the document
#   $documents_directory - directory the document is placed in
#   $orig_base_dir       - directory of the original database file; relative
#                          paths are resolved against it
#   $verbosity           - verbosity level
#
# Returns the name of the file now standing for the document inside
# $documents_directory: the document's own file name, or that name with
# ".nul" appended when the document could not be obtained and an empty
# placeholder was created instead.
#
# URLs are downloaded with wget.  Local files are copied, and the original is
# removed afterwards if its path starts with $orig_base_dir.
#
# Dies if the .nul placeholder cannot be created.
sub obtain_document
{
    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        # The file name is whatever follows the last slash.  A URL ending in
        # a slash has no such part; use wget's own default name for it rather
        # than a stale $1 from an earlier match.
        if ($document_file_full =~ /([^\/]+)$/) {
            $document_file_name = $1;
        }
        else {
            $document_file_name = "index.html";
        }
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_options = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_options, "--timestamping"); # Only re-download files if they're newer

        # The URL comes from the metadata being exploded, so run wget without
        # a shell (list form of system): quotes, backticks or "$(...)" in it
        # are then passed through as plain text.
        my $wget_status = system("wget", @wget_options, $document_file_full,
                                 "--output-document", $local_document_file);

        # wget creates the output document before it downloads anything, so
        # a failed download leaves an empty file that would pass the check
        # below; remove it.
        if ($wget_status != 0 && -e $local_document_file) {
            unlink($local_document_file);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &util::filename_cat($document_file_full);
        my $dir_sep = &util::get_os_dirsep();

        # Both groups are optional, so this always matches; $2 is the part
        # after the last directory separator.
        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;


        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir,$document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &util::cp($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        open(NULL_FILE, '>', "$local_document_file.nul")
            or die "Error: unable to create $local_document_file.nul: $!\n";
        close(NULL_FILE);
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
618
# Look up the options that the collection configuration gives for a plugin.
#
# Parameters:
#   $collectcfg - collection configuration; its 'plugin' entry is a reference
#                 to a list of [plugin name, option, option, ...] lists
#   $plugin     - name of the plugin to look for
#
# Returns the options of the first matching plugin as one string, each option
# wrapped in double quotes and separated by commas, or "" when the plugin is
# not listed (or has no options).  The configuration itself is not modified.
#
# NOTE(review): main() interpolates the returned string into a string eval;
# quotes embedded in an option value are not escaped here.
sub get_plugin_options {
    my ($collectcfg, $plugin) = @_;

    my $plugin_list = $collectcfg->{'plugin'} || [];

    foreach my $pluginoptions (@$plugin_list) {
        # Work on a copy: shifting the name off the configured list (or
        # quoting its elements in place) would corrupt the configuration for
        # every later lookup.
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless defined $pluginname && $pluginname eq $plugin;

        return join(",", map { "\"$_\"" } @option_values);
    }
    return "";
}
633
# Run the script
main(@ARGV);
635
Note: See TracBrowser for help on using the repository browser.