source: gsdl/trunk/bin/script/explode_metadata_database.pl@ 17598

Last change on this file since 17598 was 17598, checked in by kjdon, 15 years ago

When we write out metadata and add the namespace, we should first check whether a namespace is already present. I have chosen to remove the existing namespace and add the new one. Is this right? Also, I am testing for a namespace by the presence of a "." (dot).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1#!/usr/bin/perl -w
2
3
# Runs at compile time, before the 'use' lines below are processed: refuses
# to start without GSDLHOME and puts the Greenstone library directories at
# the front of @INC.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    die "GSDLHOME not set\n" if !defined $gsdl_home;

    # One call, same resulting order as two successive unshifts:
    # perllib/plugins is searched first, then perllib.
    unshift(@INC, "$gsdl_home/perllib/plugins", "$gsdl_home/perllib");
}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17use colcfg;
18
19use FileHandle;
20
21use File::Spec;
22use File::Basename;
23
# Values accepted by -input_encoding: four fixed entries, followed by one
# entry for every key of $encodings::encodings, ordered by the 'name' stored
# for that key.
my $unicode_list = [
    { 'name' => "auto",    'desc' => "{ReadTextFile.input_encoding.auto}" },
    { 'name' => "ascii",   'desc' => "{BasePlugin.encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasePlugin.encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasePlugin.encoding.unicode}" },
];

my $e = $encodings::encodings;
push(@{$unicode_list},
     map  { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } }
     sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} }
     keys(%$e));
43
# Command-line option descriptions handed to parse2::parse and to the
# PrintUsage routines. The 'desc' values in braces are resource-bundle keys.
my $arguments = [
    { name      => "language",
      desc      => "{scripts.language}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "plugin",
      desc      => "{explode.plugin}",
      type      => "string",
      reqd      => "yes",
      hiddengli => "yes" },

    { name => "input_encoding",
      desc => "{explode.encoding}",
      type => "enum",
      deft => "auto",
      list => $unicode_list,
      reqd => "no" },

    { name => "metadata_set",
      desc => "{explode.metadata_set}",
      type => "string",
      reqd => "no" },

    { name => "document_field",
      desc => "{explode.document_field}",
      type => "string",
      reqd => "no" },

    { name => "document_prefix",
      desc => "{explode.document_prefix}",
      type => "string",
      reqd => "no" },

    { name => "document_suffix",
      desc => "{explode.document_suffix}",
      type => "string",
      reqd => "no" },

    { name  => "records_per_folder",
      desc  => "{explode.records_per_folder}",
      type  => "int",
      range => "0,",
      deft  => "100",
      reqd  => "no" },

    { name    => "plugin_options",
      desc    => "{explode.plugin_options}",
      type    => "string",
      reqd    => "no",
      modegli => "3" },

    { name      => "collection",
      desc      => "{explode.collection}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name    => "verbosity",
      desc    => "{import.verbosity}",
      type    => "int",
      range   => "0,",
      deft    => "1",
      reqd    => "no",
      modegli => "4" },

    { name      => "xml",
      desc      => "",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },
];

# Top-level description of this script, wrapping the argument list above.
my $options = {
    name => "explode_metadata_database.pl",
    desc => "{explode.desc}",
    args => $arguments,
};
111
112
113
# Entry point. Parses the command line, loads the requested plugin, turns
# every record of the metadata database named on the command line into an
# entry in a metadata.xml file (attached to a real document or to a dummy
# .nul file), and finally deletes the original database file.
#
# Dies (after printing usage or an error to STDERR) when the options are
# invalid, the file is missing, the plugin cannot be created, no directory
# name can be derived from the file name, or the file yields no records.
sub main
{
    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy the parsed options into lexicals with a hash slice. Options that
    # were not supplied (and have no default) are simply left undefined.
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collection, $verbosity)
        = @{$hashParsingResult}{qw(language input_encoding metadata_set plugin
                                   document_field document_prefix document_suffix
                                   records_per_folder plugin_options collection verbosity)};
    my $xml = $hashParsingResult->{'xml'} || 0;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-empty set name becomes the "name." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    }
    else {
        $metadata_set = "";
    }

    if (defined $collection && $collection =~ /\w/) {
        # Keep the requested name: use_collection's result replaces
        # $collection and is the empty string on failure, which would make
        # the error message below print nothing useful.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection("", $collection, "");
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    }

    my $plugobj;
    require "$plugin.pm";

    # NOTE(review): the plugin name and its options come straight from the
    # command line and are compiled with a string eval. Option values are
    # therefore subject to double-quote interpolation. Left as is because
    # existing invocations may depend on that behaviour.
    if (defined $plugin_options && $plugin_options =~ /\w/) {
        my @options = split(/\s/, $plugin_options);
        foreach my $option (@options) {
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options = join (",", @options);
        eval ("\$plugobj = new $plugin([], [$plugin_options])");
        die "$@" if $@;
    }
    else {
        eval ("\$plugobj = new $plugin()");
        die "$@" if $@;
    }
    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # The directory that will hold the document files is the filename minus
    # its extension. Without a non-empty stem and an extension there is no
    # usable directory name, so stop before anything is written.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $exploded_base_dir || $exploded_base_dir eq "") {
        print STDERR "Cannot derive a directory name from $filename (a name with an extension is required)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        if (!length($text)) {
            print STDERR "No text could be read from $filename\n";
            die "\n";
        }

        # Split the text into records, using the plugin's split_exp
        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";
        if ($total_num_records == 0) {
            # Nothing would be written, so do not go on to delete the file
            print STDERR "No records found in $filename; leaving it untouched\n";
            die "\n";
        }

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);

            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc");
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix,
                                            $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);

        my $total_num_records = scalar (@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";
        if ($total_num_records == 0) {
            # Nothing would be written, so do not go on to delete the file
            print STDERR "No records found in $filename; leaving it untouched\n";
            die "\n";
        }

        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc");
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $extrametadata->{$record});

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix,
                                            $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();
}
304
305
# Creates the directory $exploded_base_dir and starts a metadata.xml file in
# it on the global METADATA_XML_FILE handle, which stays open for the other
# routines in this file to write to and close.
#
# Returns the directory that was created.
# Dies if the directory or its metadata.xml already exists, or if the
# metadata.xml file cannot be created or written.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file. Three-argument open keeps the file name
    # from being read as part of the mode, and a failure stops the run
    # instead of letting every later print go nowhere.
    open(METADATA_XML_FILE, ">", $documents_metadata_xml_file)
        or die "Error: could not create $documents_metadata_xml_file: $!\n";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n"
        or die "Error: could not write to $documents_metadata_xml_file: $!\n";

    return $documents_directory;
}
331
# Starts a new output directory when $record_number is the first record of a
# folder, and stores the directory name through $documents_dir_ref.
#
#   $exploded_base_dir   base name for the output directory
#   $record_number       1-based number of the record about to be written
#   $records_per_folder  folder size; 0 (or undefined) means "no limit",
#                        i.e. every record goes into a single folder
#   $total_num_records   number of records in the whole database
#   $documents_dir_ref   reference to the scalar holding the current directory
#
# When the records do not all fit into one folder, the directory name gets
# the zero-padded number of its first record as a suffix.
sub check_need_new_directory
{
    my ($exploded_base_dir,$record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    my $unlimited = (!defined $records_per_folder || $records_per_folder <= 0);

    # "(n - 1) % size == 0" rather than "n % size == 1": the two agree for
    # every size of 2 or more, but the latter is never true for a size of 1,
    # and a size of 0 must not reach the modulus operator at all.
    my $starts_folder = $unlimited
        ? ($record_number == 1)
        : ((($record_number - 1) % $records_per_folder) == 0);
    return unless $starts_folder;

    my $documents_directory = $exploded_base_dir;
    if (!$unlimited && $total_num_records > $records_per_folder) {
        $documents_directory .= "." . sprintf("%8.8d", $record_number);
    }

    $$documents_dir_ref = need_new_directory($documents_directory);
}
349
350
351
352
353
# Writes the metadata of one record to the open metadata.xml file.
#
# If $document_field is given, every metadata value of that field names a
# document (wrapped in $document_prefix / $document_suffix), which is fetched
# through obtain_document, and an entry is written for it. If no document
# was obtained that way, an empty "<record number>.nul" file (or "doc.nul"
# when there is no record number) is created in $documents_directory and the
# metadata is attached to that instead.
#
# Dies if the dummy .nul file cannot be created.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both options are optional on the command line, so they may be undefined
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            # Collapse doubled backslashes in the (copied) value
            $value =~ s/\\\\/\\/g;
            my $document_file_full = $document_prefix . $value . $document_suffix;

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        my $dummy_path = "$documents_directory/$document_file";
        open(DUMMY_FILE, ">", $dummy_path)
            or die "Error: could not create $dummy_path: $!\n";
        close(DUMMY_FILE);
        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
396
# Writes the closing </DirectoryMetadata> tag to the global METADATA_XML_FILE
# handle and closes it.
#
# Dies if the write or the close fails: buffered write errors only surface
# here, and the caller deletes the source database afterwards.
sub close_directory
{
    # Finish and close the metadata.xml file
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"
        or die "Error: could not finish the metadata.xml file: $!\n";
    close(METADATA_XML_FILE)
        or die "Error: could not close the metadata.xml file: $!\n";

}
404
405
# Closes the current folder's metadata.xml once $record_number is the last
# record of a folder or the last record overall.
#
#   $record_number       1-based number of the record just written
#   $records_per_folder  folder size; 0 (or undefined) means "no limit"
#   $total_num_records   number of records in the whole database
sub check_close_directory
{
    my ($record_number,$records_per_folder,$total_num_records) = @_;

    # Guard the modulus: a folder size of 0 is accepted on the command line
    # and would otherwise die with "Illegal modulus zero".
    my $folder_is_full = (defined $records_per_folder && $records_per_folder > 0)
        && (($record_number % $records_per_folder) == 0);

    if ($folder_is_full || $record_number == $total_num_records) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
415
416
417
# Prints one <FileSet> element to a metadata.xml file.
#
#   $metadata_xml_file  filehandle (or handle name) to print to
#   $file_name          name of the document the metadata belongs to; written
#                       XML-escaped and then escaped as a regular expression
#   $record_metadata    reference to a list of [field, value] pairs
#   $meta_prefix        "setname." to put in front of every field name, or ""
#                       (undefined is treated as "")
#
# The fields lastmodified, gsdlsourcefilename, gsdldoctype and FileFormat,
# and any field ending in "^all", are left out.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;
    $meta_prefix = "" unless defined $meta_prefix;

    # Make $file_name XML-safe
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it: escape
    # every regex metacharacter, not only dots and brackets, so that names
    # containing + * ? | ^ $ or a backslash still match themselves.
    $file_name =~ s/([\\.()\[\]{}+*?|^\$])/\\$1/g;

    print $metadata_xml_file
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    # We're only interested in metadata from the database
    my %is_internal_field = map { $_ => 1 }
        ("lastmodified", "gsdlsourcefilename", "gsdldoctype", "FileFormat");

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        next if $is_internal_field{$field};

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        # An undefined value is written as an empty element
        $value = "" unless defined $value;

        # Make $value XML-safe
        $value =~ s/&/&amp;/g; # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # If a prefix was requested, drop any namespace the field already
        # has (detected by a leading "word." part) and add the new one
        my $full_field = $field;
        if ($meta_prefix ne "") {
            $full_field =~ s/^\w+\.//;
            $full_field = $meta_prefix.$full_field;
        }

        # The name goes inside a double-quoted attribute, so it needs
        # escaping too
        $full_field =~ s/&/&amp;/g;
        $full_field =~ s/</&lt;/g;
        $full_field =~ s/>/&gt;/g;
        $full_field =~ s/"/&quot;/g;

        print $metadata_xml_file " <Metadata mode=\"accumulate\" name=\"$full_field\">$value</Metadata>\n";
    }

    print $metadata_xml_file
        " </Description>\n" .
        " </FileSet>\n";
}
482
# Brings the document named by $document_file_full into $documents_directory.
#
# A name starting with http:, https: or ftp: is downloaded with wget.
# Anything else is treated as a file on disk (relative names are resolved
# against $orig_base_dir) and copied; if the source lived under
# $orig_base_dir the original is removed after a successful copy.
# If the document could not be obtained, an empty "<name>.nul" file is
# created in its place.
#
# Returns the name (without directory) of the file now in
# $documents_directory, with ".nul" appended when the placeholder was used.
# Dies if the placeholder file cannot be created.
sub obtain_document
{
    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        if ($document_file_full =~ /([^\/]+)$/) {
            $document_file_name = $1;
        }
        else {
            # The URL ends in "/", so it has no last path component to use
            # as a file name; fall back to a fixed name.
            $document_file_name = "index.html";
        }
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_options = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_options, "--timestamping"); # Only re-download files if they're newer

        # List-form system: the URL comes from the metadata being exploded,
        # so it must never pass through a shell.
        my $wget_status = system("wget", @wget_options, $document_file_full,
                                 "--output-document", $local_document_file);

        # A failed download can leave an empty output file behind; remove it
        # so that the placeholder below is created instead.
        if ($wget_status != 0 && -e $local_document_file && -z $local_document_file) {
            unlink($local_document_file);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        my $dir_sep = &util::get_os_dirsep();

        # NOTE(review): $dir_sep is interpolated into the pattern as is;
        # confirm what get_os_dirsep returns on Windows before quoting it.
        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n" if ($verbosity > 2);

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir,$document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &util::cp($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            # A relative name was resolved against $orig_base_dir above, so
            # it is local by construction. An absolute name is local only if
            # it literally starts with $orig_base_dir. Comparing as plain
            # text keeps a base directory such as "." from matching every
            # path, which would delete originals stored elsewhere.
            my $is_local = !$is_absolute
                || index($document_file_full, $orig_base_dir) == 0;
            if ($is_local) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        open(NULL_FILE, ">", "$local_document_file.nul")
            or die "Error: could not create $local_document_file.nul: $!\n";
        close(NULL_FILE);
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
555
# Run the script, handing the command-line arguments to main.
main(@ARGV);
557
Note: See TracBrowser for help on using the repository browser.