root/gsdl/trunk/bin/script/explode_metadata_database.pl @ 15074

Revision 15074, 11.7 KB (checked in by ak19, 12 years ago)

added -w in #!/usr/bin/perl -w

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
8}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17use FileHandle;
18
19my $unicode_list =
20    [ { 'name' => "auto",
21    'desc' => "{BasPlug.input_encoding.auto}" },
22      { 'name' => "ascii",
23    'desc' => "{BasPlug.input_encoding.ascii}" },
24      { 'name' => "utf8",
25    'desc' => "{BasPlug.input_encoding.utf8}" },
26      { 'name' => "unicode",
27    'desc' => "{BasPlug.input_encoding.unicode}" } ];
28
29my $e = $encodings::encodings;
30foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
31{
32    my $hashEncode =
33    {'name' => $enc,
34     'desc' => $e->{$enc}->{'name'}};
35   
36    push(@{$unicode_list},$hashEncode);
37}
38
39my $arguments =
40    [
41      { 'name' => "language",
42    'desc' => "{scripts.language}",
43    'type' => "string",
44    'reqd' => "no",
45        'hiddengli' => "yes" },
46      { 'name' => "plugin",
47    'desc' => "{explode.plugin}",
48    'type' => "string",
49    'reqd' => "yes",
50    'hiddengli' => "yes"},
51      { 'name' => "input_encoding",
52    'desc' => "{explode.encoding}",
53    'type' => "enum",
54    'deft' => "auto",
55    'list' => $unicode_list,
56    'reqd' => "no" },
57      { 'name' => "metadata_set",
58    'desc' => "{explode.metadata_set}",
59    'type' => "string",
60    'reqd' => "no" },
61      { 'name' => "document_field",
62    'desc' => "{explode.document_field}",
63    'type' => "string",
64    'reqd' => "no"},
65       { 'name' => "document_prefix",
66    'desc' => "{explode.document_prefix}",
67    'type' => "string",
68    'reqd' => "no"},
69      { 'name' => "document_suffix",
70    'desc' => "{explode.document_suffix}",
71    'type' => "string",
72    'reqd' => "no"},
73      { 'name' => "records_per_folder",
74    'desc' => "{explode.records_per_folder}",
75    'type' => "int",
76    'range' => "0,",
77    'deft' => "100",
78    'reqd' => "no" },
79      { 'name' => "verbosity",
80    'desc' => "{import.verbosity}",
81    'type' => "int",
82    'range' => "0,",
83    'deft' => "1",
84    'reqd' => "no",
85    'modegli' => "4" },
86      { 'name' => "xml",
87    'desc' => "",
88    'type' => "flag",
89    'reqd' => "no",
90    'hiddengli' => "yes" }
91      ];
92   
93my $options = { 'name' => "explode_metadata_database.pl",
94        'desc' => "{explode.desc}",
95        'args' => $arguments };
96
97       
98sub main
99{
100    my ($language, $input_encoding, $metadata_set, $plugin,
101    $document_field, $document_prefix, $document_suffix, $records_per_folder, $verbosity);
102
103    my $xml = 0;
104
105    my $hashParsingResult = {};
106    # parse the options
107    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
108
109    # If parse returns -1 then something has gone wrong
110    if ($intArgLeftinAfterParsing == -1)
111    {
112    &PrintUsage::print_txt_usage($options, "{explode.params}");
113    die "\n";
114    }
115
116    foreach my $strVariable (keys %$hashParsingResult)
117    {
118    eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
119    }
120
121    # If $language has been specified, load the appropriate resource bundle
122    # (Otherwise, the default resource bundle will be loaded automatically)
123    if ($language && $language =~ /\S/) {
124    &gsprintf::load_language_specific_resource_bundle($language);
125    }
126
127    if ($xml) {
128        &PrintUsage::print_xml_usage($options);
129    print "\n";
130    return;
131    }
132
133    # There should one arg left after parsing (the filename)
134    # Or the user may have specified -h, in which case we output the usage
135    if($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
136    {
137    &PrintUsage::print_txt_usage($options, "{explode.params}");
138    die "\n";
139    }
140
141    # The metadata database filename is the first value that remains after the options have been parsed out
142    my $filename = $ARGV[0];
143    if (!defined $filename || $filename !~ /\w/) {
144    &PrintUsage::print_txt_usage($options, "{explode.params}");
145    print STDERR "You need to specify a filename";
146    die "\n";
147    }
148    # check that file exists
149    if (!-e $filename) {
150    print STDERR "File $filename doesn't exist...\n";
151    die "\n";
152    }
153    # check required options
154    if (!defined $plugin || $plugin !~ /\w/) {
155    &PrintUsage::print_txt_usage($options, "{explode.params}");
156    print STDERR "You need to specify a plugin";
157    die "\n";
158    }
159   
160    # check metadata set
161    if (defined $metadata_set && $metadata_set =~ /\w/) {
162    $metadata_set .= ".";
163    } else {
164    $metadata_set = "";
165    }
166
167    my $plugobj;
168    require "$plugin.pm";
169    eval ("\$plugobj = new $plugin()");
170    die "$@" if $@;
171
172    # ...and initialize it
173    $plugobj->init(1, "STDERR", "STDERR");
174   
175    if ($input_encoding eq "auto") {
176    ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
177    }
178
179    my $text = "";
180    # Use the plugin's read_file function to avoid duplicating code
181    $plugobj->read_file($filename, $input_encoding, undef, \$text);
182    # is there any text in the file??
183    die "\n" unless length($text);
184
185    # Create a directory to store the document files...
186    my ($documents_directory_base) = ($filename =~ /(.*)\.[^\.]+$/);
187
188    # Split the text into records, using the plugin's split_exp
189    my $split_exp = $plugobj->{'split_exp'};
190    my @metadata_records = split(/$split_exp/, $text);
191    print STDERR "Number of records: " . scalar(@metadata_records) . "\n";
192
193    # Write the metadata from each record to the metadata.xml file
194    my $record_number = 1;
195    my $documents_directory;
196    foreach my $record_text (@metadata_records) {
197    # Check if we need to start a new directory for these records
198    if (($record_number % $records_per_folder) == 1) {
199        $documents_directory = $documents_directory_base;
200        if (scalar(@metadata_records) > $records_per_folder) {
201        $documents_directory .= "." . sprintf("%8.8d", $record_number);
202        }
203        if (-d $documents_directory) {
204        die "Error: document directory $documents_directory already exists (bailing).\n";
205        }
206        &util::mk_dir($documents_directory);
207
208        my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
209        if (-e $documents_metadata_xml_file) {
210        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
211        }
212
213        # Start the metadata.xml file
214        open(METADATA_XML_FILE, ">$documents_metadata_xml_file");
215        print METADATA_XML_FILE
216        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
217        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
218        "<DirectoryMetadata>\n";
219    }
220
221    # Use the plugin's process function to avoid duplicating code
222    my $doc_obj = new doc($filename, "nonindexed_doc");
223    $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);
224    # Get all the metadata assigned to this record
225    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
226    my $document_file;
227   
228    # try to get a doc to attach the metadata to
229    if (defined $document_field) {
230        foreach my $pair (@$record_metadata) {
231        my ($field, $value) = (@$pair);
232        $value =~ s/\\\\/\\/g;
233
234        # Does this metadata element specify a document to obtain?
235        if ($field eq $document_field) {
236            my $document_file_full = $document_prefix . $value . $document_suffix;
237            $document_file = &obtain_document($document_file_full, $documents_directory, $verbosity);
238            &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
239        }
240        }
241    }
242    # Create a dummy .nul file if we haven't obtained any documents for this record
243    if (not defined $document_file) {
244        $document_file = sprintf("%8.8d", $record_number) . ".nul";
245        open(DUMMY_FILE, ">$documents_directory/$document_file");
246        close(DUMMY_FILE);
247        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
248    }
249
250    if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@metadata_records)) {
251        # Finish and close the metadata.xml file
252        print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
253        close(METADATA_XML_FILE);
254    }
255    $record_number = $record_number + 1;
256    }
257
258    # Explode means just that: the original file is deleted
259    &util::rm($filename);
260    $plugobj->clean_up_after_exploding();
261}
262
263
264sub write_metadata_xml_file_entry
265{
266    my $metadata_xml_file = shift(@_);
267    my $file_name = shift(@_);
268    my $record_metadata = shift(@_);
269    my $meta_prefix = shift(@_);
270   
271    # Make $file_name XML-safe
272    $file_name =~ s/&/&amp;/g;
273    $file_name =~ s/</&lt;/g;
274    $file_name =~ s/>/&gt;/g;
275
276    # Convert $file_name into a regular expression that matches it
277    $file_name =~ s/\./\\\./g;
278    $file_name =~ s/\(/\\\(/g;
279    $file_name =~ s/\)/\\\)/g;
280    $file_name =~ s/\{/\\\{/g;
281    $file_name =~ s/\}/\\\}/g;
282    $file_name =~ s/\[/\\\[/g;
283    $file_name =~ s/\]/\\\]/g;
284   
285    print $metadata_xml_file
286    "\n" .
287        "  <FileSet>\n" .
288    "    <FileName>$file_name</FileName>\n" .
289    "    <Description>\n";
290
291    foreach my $pair (@$record_metadata) {
292    my ($field, $value) = (@$pair);
293
294    # We're only interested in metadata from the database
295    next if ($field eq "lastmodified");
296    next if ($field eq "gsdlsourcefilename");
297    next if ($field eq "gsdldoctype");
298    next if ($field eq "FileFormat");
299
300    # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
301    next if ($field =~ /\^all$/);  # ISISPlug specific!
302
303    # Make $value XML-safe
304    $value =~ s/&/&amp;/g;  # May mess up existing entities!
305    $value =~ s/</&lt;/g;
306    $value =~ s/>/&gt;/g;
307
308    # we are not allowed & in xml except in entities.
309    # if there are undefined entities then parsing will also crap out.
310    # should we be checking for them too?
311    # this may not get all possibilities
312    # $value =~ s/&([^;\s]*(\s|$))/&amp;$1/g;
313
314    print $metadata_xml_file "      <Metadata mode=\"accumulate\" name=\"$meta_prefix$field\">$value</Metadata>\n";
315    }
316
317    print $metadata_xml_file
318    "    </Description>\n" .
319        "  </FileSet>\n";
320}
321
322sub obtain_document
323{
324    my $document_file_full = shift(@_);
325    my $documents_directory = shift(@_);
326    my $verbosity = shift(@_);
327   
328    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);
329
330    my $document_file_name;
331    my $local_document_file;
332
333    # Document specified is on the web
334    if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
335    $document_file_full =~ /([^\/]+)$/;
336    $document_file_name = $1;
337    $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
338
339    my $wget_options = "--quiet";
340    $wget_options = "--verbose" if ($verbosity > 2);
341    $wget_options .= " --timestamping";  # Only re-download files if they're newer
342    my $wget_command = "wget $wget_options \"$document_file_full\" --output-document \"$local_document_file\"";
343    `$wget_command`;
344
345    # Check the document was obtained successfully
346    if (!-e $local_document_file) {
347        print STDERR "WARNING: Could not obtain document file $document_file_full\n";
348    }
349    }
350    # Document specified is on the disk
351    else {
352    my $dir_sep = &util::get_os_dirsep();
353    $document_file_full =~ /(.+$dir_sep)?(.*)$/;
354    $document_file_name = $2;
355    $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
356
357    # Only bother trying to copy the file if it contained some path information
358    if ($document_file_full ne $document_file_name) {
359        &util::cp($document_file_full, $documents_directory);
360
361        # Check the document was obtained successfully
362        if (!-e $local_document_file) {
363        print STDERR "WARNING: Could not obtain document file $document_file_full\n";
364        }
365    }
366    }
367
368    # If the document wasn't obtained successfully, create a .nul file for it
369    if (!-e $local_document_file) {
370    $document_file_name .= ".nul";
371    open(NULL_FILE, ">$local_document_file.nul");
372    close(NULL_FILE);
373    }
374
375    return $document_file_name;
376}
377
378&main(@ARGV);
Note: See TracBrowser for help on using the browser.