root/main/trunk/greenstone2/bin/script/explode_metadata_database.pl @ 24829

Revision 24829, 19.4 KB (checked in by ak19, 8 years ago)

Changes to bat files and perl code to deal with brackets in (Windows) filepath. Also checked winmake.bat files to see if changes were needed there. These changes go together with the commits 24826 to 24828 for gems.bat, and commit 24820 on makegs2.bat.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3
# Make the Greenstone Perl libraries findable before any of the `use`
# statements below are compiled.  Requires the GSDLHOME environment variable.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    die "GSDLHOME not set\n" unless defined $gsdl_home;

    # Resulting search order: perllib/plugins first, then perllib.
    unshift @INC, "$gsdl_home/perllib/plugins", "$gsdl_home/perllib";
}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17use colcfg;
18
19use FileHandle;
20
21use File::Spec;
22use File::Basename;
23
# Choices offered for the -input_encoding option: four fixed entries first,
# followed by every encoding listed in $encodings::encodings, ordered by the
# encoding's display name.
my $unicode_list = [
    { 'name' => "auto",    'desc' => "{ReadTextFile.input_encoding.auto}" },
    { 'name' => "ascii",   'desc' => "{BasePlugin.encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasePlugin.encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasePlugin.encoding.unicode}" },
];

my $e = $encodings::encodings;
my @encodings_by_display_name =
    sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} } keys(%$e);

# Each entry uses the encoding's key as 'name' and its display name as 'desc'.
push(@{$unicode_list},
     map { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } } @encodings_by_display_name);
43
# Command-line option table handed to parse2::parse() and to the PrintUsage
# routines.  One hash per option; 'desc' values are resource-bundle keys.
my $arguments = [
    {   'name'      => "language",
        'desc'      => "{scripts.language}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {   'name'      => "plugin",
        'desc'      => "{explode.plugin}",
        'type'      => "string",
        'reqd'      => "yes",
        'hiddengli' => "yes",
    },
    {   'name' => "input_encoding",
        'desc' => "{explode.encoding}",
        'type' => "enum",
        'deft' => "auto",
        'list' => $unicode_list,
        'reqd' => "no",
    },
    {   'name' => "metadata_set",
        'desc' => "{explode.metadata_set}",
        'type' => "string",
        'reqd' => "no",
    },
    {   'name' => "document_field",
        'desc' => "{explode.document_field}",
        'type' => "string",
        'reqd' => "no",
    },
    {   'name' => "document_prefix",
        'desc' => "{explode.document_prefix}",
        'type' => "string",
        'reqd' => "no",
    },
    {   'name' => "document_suffix",
        'desc' => "{explode.document_suffix}",
        'type' => "string",
        'reqd' => "no",
    },
    {   'name'  => "records_per_folder",
        'desc'  => "{explode.records_per_folder}",
        'type'  => "int",
        'range' => "0,",
        'deft'  => "100",
        'reqd'  => "no",
    },
    {   'name'      => "collectdir",
        'desc'      => "{import.collectdir}",
        'type'      => "string",
        # The default is deliberately left empty here rather than being
        # computed as &util::filename_cat ($ENV{'GSDLHOME'}, "collect").
        'deft'      => "",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {   'name'      => "site",
        'desc'      => "{import.site}",
        'type'      => "string",
        'deft'      => "",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {   'name'      => "collection",
        'desc'      => "{explode.collection}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {   'name'      => "use_collection_plugin_options",
        'desc'      => "{explode.use_collection_plugin_options}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {   'name'      => "plugin_options",
        'desc'      => "{explode.plugin_options}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {   'name'    => "verbosity",
        'desc'    => "{import.verbosity}",
        'type'    => "int",
        'range'   => "0,",
        'deft'    => "1",
        'reqd'    => "no",
        'modegli' => "3",
    },
    {   'name'      => "xml",
        'desc'      => "",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
];

# Top-level description of this script for the usage printers.
my $options = {
    'name' => "explode_metadata_database.pl",
    'desc' => "{explode.desc}",
    'args' => $arguments,
};
130
131
132
# Entry point.  Parses the command line, loads the requested plugin, splits
# the metadata database file named on the command line into records, and for
# each record writes a document (or a .nul placeholder) plus an entry in a
# metadata.xml file.  When every record has been written the original
# database file is deleted.
#
# Reads @ARGV directly; dies (after printing usage or an error to STDERR) on
# any invalid combination of options.
sub main
{
    my ($language, $input_encoding, $metadata_set, $plugin,
        $document_field, $document_prefix, $document_suffix,
        $records_per_folder, $plugin_options, $collectdir, $site, $collection,
        $use_collection_plugin_options, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # Copy each parsed option into the local variable of the same name.
    # Options that have no matching variable here are ignored.
    my %option_slot = (
        'language'                      => \$language,
        'input_encoding'                => \$input_encoding,
        'metadata_set'                  => \$metadata_set,
        'plugin'                        => \$plugin,
        'document_field'                => \$document_field,
        'document_prefix'               => \$document_prefix,
        'document_suffix'               => \$document_suffix,
        'records_per_folder'            => \$records_per_folder,
        'plugin_options'                => \$plugin_options,
        'collectdir'                    => \$collectdir,
        'site'                          => \$site,
        'collection'                    => \$collection,
        'use_collection_plugin_options' => \$use_collection_plugin_options,
        'verbosity'                     => \$verbosity,
        'xml'                           => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $slot = $option_slot{$strVariable};
        next unless defined $slot;
        ${$slot} = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after
    # the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-empty set name becomes the prefix "name."
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }

    if (defined $collection && $collection =~ /\w/) {
        # Remember what the user asked for: use_collection() returns "" on
        # failure, which would otherwise blank the name in the error message.
        my $requested_collection = $collection;
        $collection = &colcfg::use_collection($site, $collection, $collectdir);
        if ($collection eq "") {
            print STDERR "Collection $requested_collection does not exist\n";
            die "\n";
        }
    } else {
        undef $collection;
    }

    if ($use_collection_plugin_options) {
        if (defined $plugin_options && $plugin_options =~ /\w/) {
            print STDERR "Error: you cannot have -use_collection_plugin_options and -plugin_options set at the same time\n";
            die "\n";
        }
        if (not defined $collection) {
            print STDERR "Error: you must specify a collection using -collection to use -use_collection_plugin_options\n";
            die "\n";
        }
    }

    my $plugobj;
    require "$plugin.pm";

    # Build the plugin's option list as Perl source text: a comma-separated
    # sequence of double-quoted strings.
    my $plugin_options_string = "";
    if ($use_collection_plugin_options) {
        # Read in the collection configuration file.
        my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name(STDERR);
        my $collectcfg = &colcfg::read_collect_cfg ($configfilename, $gs_mode);
        $plugin_options_string = &get_plugin_options($collectcfg, $plugin);
    }
    elsif (defined $plugin_options && $plugin_options =~ /\w/) {
        my @options = split(/\s/, $plugin_options);
        foreach my $option (@options) {
            $option = "\"$option\"" unless $option =~ /^\"/;
        }
        $plugin_options_string = join (",", @options);
    }

    if ($plugin_options_string eq "") {
        $plugobj = eval { $plugin->new() };
        die "$@" if $@;
    } else {
        # NOTE(review): string eval of user/collection-supplied option text.
        # The options are interpolated as double-quoted Perl strings, so
        # characters such as " $ @ \ inside an option are interpreted by Perl.
        eval ("\$plugobj = new $plugin([], [$plugin_options_string])");
        die "$@" if $@;
    }

    # ...and initialize it
    $plugobj->init($verbosity, "STDERR", "STDERR");

    if ($input_encoding eq "auto") {
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    # The directory that receives the document files is the database file's
    # path minus its extension.  The extension must belong to the file name
    # itself, not to a dotted directory earlier in the path.
    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^.\/\\]+$/);
    if (!defined $exploded_base_dir) {
        # Bail out before anything is written or deleted.
        print STDERR "Error: cannot work out an output directory for $filename (the file has no extension)\n";
        die "\n";
    }

    my $orig_base_dir = &File::Basename::dirname($filename);

    my $split_exp = $plugobj->{'split_exp'};
    if (defined $split_exp) {
        # Read in file, and then split and process individual records

        my $text = "";
        # Use the plugin's read_file function to avoid duplicating code
        $plugobj->read_file($filename, $input_encoding, undef, \$text);
        # is there any text in the file??
        die "\n" unless length($text);

        # Split the text into records, using the plugin's split_exp
        my @metadata_records = split(/$split_exp/, $text);
        my $total_num_records = scalar(@metadata_records);
        print STDERR "Number of records: $total_num_records\n";

        # Write the metadata from each record to the metadata.xml file
        my $record_number = 1;
        my $documents_directory;
        foreach my $record_text (@metadata_records) {

            # Check if we need to start a new directory for these records
            check_need_new_directory($exploded_base_dir, $record_number,
                                     $records_per_folder, $total_num_records,
                                     \$documents_directory);

            # Use the plugin's process function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix,
                                            $metadata_set, $verbosity);

            check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }
    else {
        # Call metadata_read to set up associated metadata

        my $pluginfo = undef;
        my $block_hash = {};

        my $processor = undef;
        my $maxdocs = undef;
        my $gli = undef;

        my $extrametakeys = [];
        my $extrametadata = {};
        my $extrametafile = {};

        $plugobj->metadata_read($pluginfo, "", $filename, $block_hash,
                                $extrametakeys, $extrametadata, $extrametafile,
                                $processor, $maxdocs, $gli);

        my $total_num_records = scalar (@$extrametakeys);
        print STDERR "Number of records: $total_num_records\n";
        my $record_number = 1;
        my $documents_directory;
        foreach my $record (@$extrametakeys) {
            &check_need_new_directory($exploded_base_dir, $record_number,
                                      $records_per_folder, $total_num_records,
                                      \$documents_directory);

            # Attach metadata to object
            # => use the plugin's extra_metadata function to avoid duplicating code
            my $doc_obj = doc->new($filename, "nonindexed_doc", $plugobj->get_file_rename_method());
            # all the metadata has been extracted into extrametadata
            $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $extrametadata->{$record});

            # Try to get a doc to attach the metadata to
            # If no match found, create a dummy .nul file
            attach_metadata_or_make_nul_doc($document_field, $doc_obj, $record_number,
                                            $documents_directory, $orig_base_dir,
                                            $document_prefix, $document_suffix,
                                            $metadata_set, $verbosity);

            &check_close_directory($record_number, $records_per_folder, $total_num_records);

            $record_number = $record_number + 1;
        }
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();

}
350
351
# Create the directory that will hold one batch of exploded records and start
# its metadata.xml file.
#
#   $exploded_base_dir - path of the directory to create; must not exist yet.
#
# On return the package filehandle METADATA_XML_FILE is open for writing on
# <directory>/metadata.xml, with the XML prologue and the opening
# <DirectoryMetadata> tag already written.
#
# Dies if the directory or its metadata.xml already exists, or if the
# metadata.xml file cannot be created.
# Returns the path of the directory.
sub need_new_directory
{
    my ($exploded_base_dir) = @_;

    my $documents_directory = $exploded_base_dir;

    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file.  A failure here must be fatal: carrying on
    # would discard every metadata entry while the source file still gets
    # deleted at the end of the run.
    open(METADATA_XML_FILE, '>', $documents_metadata_xml_file)
        or die "Error: could not create metadata.xml file $documents_metadata_xml_file: $!\n";
    print METADATA_XML_FILE
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    return $documents_directory;
}
377
# Start a new output directory if $record_number is the first record of a
# folder.
#
#   $exploded_base_dir  - base path for the output directory/directories
#   $record_number      - 1-based number of the record about to be written
#   $records_per_folder - maximum records per directory; 0 (or undefined) is
#                         treated as "no limit", i.e. one directory for all
#   $total_num_records  - number of records in the whole database
#   $documents_dir_ref  - scalar ref that receives the new directory's path
#
# When the records do not all fit in one folder, each directory name gets a
# ".NNNNNNNN" suffix holding the number of its first record.
sub check_need_new_directory
{
    my ($exploded_base_dir,$record_number, $records_per_folder,
        $total_num_records, $documents_dir_ref) = @_;

    # The option's range permits 0, which must never reach the modulus below.
    my $no_limit = !$records_per_folder;

    my $starts_folder;
    if ($no_limit) {
        $starts_folder = ($record_number == 1);
    }
    else {
        # "% 1" is always 0, so a folder size of 1 needs its own test.
        $starts_folder = ($records_per_folder == 1
                          || ($record_number % $records_per_folder) == 1);
    }
    return unless $starts_folder;

    my $documents_directory = $exploded_base_dir;
    if (!$no_limit && $total_num_records > $records_per_folder) {
        $documents_directory .= "." . sprintf("%8.8d", $record_number);
    }

    $$documents_dir_ref = need_new_directory($documents_directory);
}
395
396
397
398
399
# Write the metadata.xml entry (or entries) for one record, attaching the
# metadata to the document named by the record where possible.
#
#   $document_field      - metadata field whose value names the record's
#                          document; may be undefined
#   $doc_obj             - object providing get_top_section() and
#                          get_all_metadata(); the latter is iterated as a
#                          list of [field, value] pairs
#   $record_number       - record number, used to name the placeholder file
#   $documents_directory - directory receiving documents and placeholders
#   $orig_base_dir       - passed through to obtain_document()
#   $document_prefix     - text put before the field value; may be undefined
#   $document_suffix     - text put after the field value; may be undefined
#   $metadata_set        - passed through to write_metadata_xml_file_entry()
#   $verbosity           - passed through to obtain_document()
#
# Every metadata pair whose field equals $document_field produces one
# obtain_document() call and one metadata.xml entry.  If no document was
# obtained this way, an empty "<record number>.nul" file is created and the
# metadata is attached to that instead.  Dies if the .nul file cannot be
# created.
sub attach_metadata_or_make_nul_doc
{
    my ($document_field, $doc_obj, $record_number,
        $documents_directory, $orig_base_dir,
        $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;

    # Both options are optional and have no default, so they may arrive
    # undefined; treat that as "nothing to add".
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    my $document_file;

    # try to get a doc to attach the metadata to
    if (defined $document_field) {
        foreach my $pair (@$record_metadata) {
            my ($field, $value) = (@$pair);
            $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
            $value =~ s/\\\\/\\/g;         # don't regex brackets () here though!

            # Does this metadata element specify a document to obtain?
            next unless $field eq $document_field;

            my $document_file_full;
            if (-d $document_prefix && $document_prefix !~ m@^(http|ftp|https)://@ ) {
                # if the document-prefix refers to a directory but not URL, ensure it has a file-separator at the end
                # by first of all stripping any trailing slash and then always ensuring one is used through filename_cat
                $document_prefix =~ s/(\/|\\)$//;
                $document_file_full = &util::filename_cat($document_prefix, "$value$document_suffix");
            } else { # the doc prefix may also contain the prefix of the actual *filename* following the directory
                $document_file_full = $document_prefix . $value . $document_suffix;
            }

            # this either downloads/copies the document, or creates a nul file for it.
            $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
            &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
        }
    }

    # Create a dummy .nul file if we haven't obtained a document (or null file) for this record
    if (not defined $document_file) {

        if (defined ($record_number)) {
            $document_file = sprintf("%8.8d", $record_number) . ".nul";
        }
        else {
            $document_file = "doc.nul";
        }

        my $dummy_path = "$documents_directory/$document_file";
        open(my $dummy_fh, '>', $dummy_path)
            or die "Error: could not create nul document $dummy_path: $!\n";
        close($dummy_fh);

        &write_metadata_xml_file_entry(\*METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    }

}
450
# Finish the metadata.xml file currently open on the package filehandle
# METADATA_XML_FILE: write the closing </DirectoryMetadata> tag and close it.
#
# Dies if the close fails.  Buffered write errors (e.g. a full disk) only
# surface at close, and the caller goes on to delete the source database, so
# a failure must not pass silently.
sub close_directory
{
    # Finish and close the metadata.xml file
    print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
    close(METADATA_XML_FILE)
        or die "Error: could not finish writing the metadata.xml file: $!\n";

}
458
459
# Close the current output directory's metadata.xml if $record_number is the
# last record of its folder or the last record overall.
#
#   $record_number      - 1-based number of the record just written
#   $records_per_folder - maximum records per directory; 0 (or undefined) is
#                         treated as "no limit", so only the final record
#                         closes the directory
#   $total_num_records  - number of records in the whole database
sub check_close_directory
{
    my ($record_number,$records_per_folder,$total_num_records) = @_;

    my $is_last_record = ($record_number == $total_num_records);

    # The option's range permits 0, which must never reach the modulus.
    my $folder_is_full = 0;
    if ($records_per_folder) {
        $folder_is_full = (($record_number % $records_per_folder) == 0);
    }

    if ($folder_is_full || $is_last_record) {
        # Finish and close the metadata.xml file
        close_directory();
    }
}
469       
470
471
# Append one <FileSet> element to a metadata.xml file.
#
#   $metadata_xml_file - filehandle to print to
#   $file_name         - name of the file the metadata belongs to; written as
#                        a regular expression that matches that name
#   $record_metadata   - array ref of [field, value] pairs
#   $meta_prefix       - metadata set prefix (e.g. "dc."), or "" / undefined
#                        for none
#
# The fields lastmodified, gsdlsourcefilename, gsdldoctype and FileFormat,
# and any field ending in "^all", are skipped.  A leading "ex." is dropped
# from a field when it is the field's only set prefix.  When $meta_prefix is
# non-empty it replaces the field's first "word." prefix (or is prepended if
# the field has none).
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;

    $meta_prefix = "" unless defined $meta_prefix;

    # Make $file_name XML-safe
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it.  Besides
    # dots and brackets this also escapes + * ? | ^ $, which would otherwise
    # be read as regex operators and stop the pattern matching the file.
    $file_name =~ s/([.(){}\[\]+*?|^\$])/\\$1/g;

    print $metadata_xml_file
        "\n" .
        "  <FileSet>\n" .
        "    <FileName>$file_name</FileName>\n" .
        "    <Description>\n";

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        # We're only interested in metadata from the database
        next if ($field eq "lastmodified");
        next if ($field eq "gsdlsourcefilename");
        next if ($field eq "gsdldoctype");
        next if ($field eq "FileFormat");

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/);  # ISISPlug specific!

        $field =~ s/^ex\.([^.]+)$/$1/; #remove any ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

        # Square brackets in metadata values need to be escaped so they don't confuse Greenstone/GLI
        $value =~ s/\[/&\#091;/g;
        $value =~ s/\]/&\#093;/g;

        # Make $value XML-safe.  This runs after the bracket escaping, so a
        # "[" is written to the file as "&amp;#091;" and the parsed value
        # carries the literal text "&#091;".
        # NOTE(review): order kept exactly as it was - presumably this is the
        # form Greenstone/GLI expect; confirm before changing.
        $value =~ s/&/&amp;/g;  # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # do we already have a namespace specified?
        my $full_field = $field;
        if ($meta_prefix ne "") {
            $full_field =~ s/^\w+\.//;
            $full_field = $meta_prefix.$full_field;
        }

        # The name goes inside a double-quoted attribute, so it has to be
        # XML-safe too.
        $full_field =~ s/&/&amp;/g;
        $full_field =~ s/</&lt;/g;
        $full_field =~ s/>/&gt;/g;
        $full_field =~ s/"/&quot;/g;

        print $metadata_xml_file "      <Metadata mode=\"accumulate\" name=\"$full_field\">$value</Metadata>\n";
    }

    print $metadata_xml_file
        "    </Description>\n" .
        "  </FileSet>\n";
}
542
# Fetch the document belonging to a record into the output directory.
#
#   $document_file_full  - http/https/ftp URL, or a file path (a relative
#                          path is resolved against $orig_base_dir)
#   $documents_directory - directory the document is placed in
#   $orig_base_dir       - directory containing the metadata database
#   $verbosity           - messages are printed to STDERR above level 1
#
# URLs are downloaded with wget.  Files on disk are copied, and the original
# is then removed if its path starts with $orig_base_dir.  If the document
# could not be obtained, an empty "<name>.nul" file is created in its place.
#
# Returns the name (without directory) of the file now present in
# $documents_directory: the document's name, or that name with ".nul"
# appended.  Dies if the .nul file cannot be created.
sub obtain_document
{
    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
        # The local name is the last path component of the URL.  A URL that
        # ends in "/" has none; fall back to the conventional index.html
        # instead of reading an unset capture variable.
        if ($document_file_full =~ /([^\/]+)$/) {
            $document_file_name = $1;
        }
        else {
            $document_file_name = "index.html";
        }
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_options = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_options, "--timestamping");  # Only re-download files if they're newer

        # List form: the URL comes from the metadata being exploded and must
        # never be interpreted by a shell.
        system("wget", @wget_options, $document_file_full,
               "--output-document", $local_document_file);

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
    }
    # Document specified is on the disk
    else {
        # convert the dirseps in filepath to correct dir sep for OS
        $document_file_full = &util::filename_cat($document_file_full);
        my $dir_sep = &util::get_os_dirsep();

        # NOTE(review): $dir_sep is interpolated into the pattern unescaped,
        # so get_os_dirsep() presumably returns a regex-ready separator -
        # confirm against util.pm.
        $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;

        my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
        print STDERR "doc file full = $document_file_full\n";

        if (!$is_absolute) {
            $document_file_full
                = &util::filename_cat($orig_base_dir,$document_file_full);
        }

        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        if (-e $document_file_full) {
            &util::cp($document_file_full, $documents_directory);
        }

        # Check the document was obtained successfully
        if (!-e $local_document_file) {
            print STDERR "WARNING: Could not obtain document file $document_file_full\n";
        }
        else {
            $orig_base_dir = &util::filename_to_regex($orig_base_dir); # escape windows style slashes for the regex below
            if ($document_file_full =~ m/^$orig_base_dir.*/) {
                # file local to metadata record
                # => copy has been made successfully, so remove original
                &util::rm($document_file_full);
            }
        }
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
        $document_file_name .= ".nul";
        open(my $nul_fh, '>', "$local_document_file.nul")
            or die "Error: could not create nul document $local_document_file.nul: $!\n";
        close($nul_fh);
        print STDERR "Creating a nul document $document_file_name\n";
    }

    return $document_file_name;
}
617
# Look up the options configured for a plugin in a collection configuration.
#
#   $collectcfg - hash ref whose 'plugin' entry is an array ref of array
#                 refs, each holding a plugin name followed by its options
#   $plugin     - name of the plugin to look up
#
# Returns the options of the first entry whose name equals $plugin as Perl
# list source text - each option wrapped in double quotes, joined with
# commas - or "" when the plugin is not listed or has no options.
# $collectcfg is left untouched, so the lookup can be repeated.
sub get_plugin_options {
    my ($collectcfg, $plugin)  = @_;

    my $plugin_list = $collectcfg->{'plugin'};
    return "" unless defined $plugin_list;

    foreach my $pluginoptions (@$plugin_list) {
        # Work on a copy: the entry belongs to the caller's configuration.
        my ($pluginname, @option_values) = @$pluginoptions;
        next unless defined $pluginname && $pluginname eq $plugin;
        return join(",", map { "\"$_\"" } @option_values);
    }
    return "";
}
632
# Run the script; main() also reads the file name and options from @ARGV.
main(@ARGV);
634
Note: See TracBrowser for help on using the browser.