source: trunk/gsdl/bin/script/explode_metadata_database.pl@ 11332

Last change on this file since 11332 was 11332, checked in by mdewsnip, 18 years ago

Added a mechanism for plugins to do tidying up after exploding. ISISPlug uses this to delete the associated .fdt and .xrf files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1#!/usr/bin/perl
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
8}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17
# Choices offered for the -input_encoding option.  The four built-in choices
# come first; their descriptions are resource-bundle keys.
my $unicode_list = [];
foreach my $builtin_choice ("auto", "ascii", "utf8", "unicode") {
    push(@$unicode_list,
         { 'name' => $builtin_choice,
           'desc' => "{BasPlug.input_encoding.$builtin_choice}" });
}

# Then every encoding listed in $encodings::encodings, ordered by its
# descriptive 'name' entry (which is also used as the description).
my $e = $encodings::encodings;
my @encoding_keys = sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} } keys(%$e);
push(@$unicode_list,
     map { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } } @encoding_keys);
37
# Command-line options understood by this script, in the table format that
# parse2::parse() and the PrintUsage functions are given below.
# 'desc' values in braces are resource-bundle keys.
my $arguments = [
    { name => 'language',        desc => '{scripts.language}',
      type => 'string',          reqd => 'no',
      hiddengli => 'yes' },

    { name => 'plugin',          desc => '{explode.plugin}',
      type => 'string',          reqd => 'yes',
      hiddengli => 'yes' },

    { name => 'input_encoding',  desc => '{explode.encoding}',
      type => 'enum',            deft => 'auto',
      list => $unicode_list,     reqd => 'no' },

    { name => 'metadata_set',    desc => '{explode.metadata_set}',
      type => 'string',          reqd => 'no',
      hiddengli => 'yes' },

    { name => 'document_field',  desc => '{explode.document_field}',
      type => 'string',          reqd => 'no' },

    { name => 'document_prefix', desc => '{explode.document_prefix}',
      type => 'string',          reqd => 'no' },

    { name => 'document_suffix', desc => '{explode.document_suffix}',
      type => 'string',          reqd => 'no' },

    { name => 'filename_field',  desc => '{explode.filename_field}',
      type => 'string',          reqd => 'no' },

    { name => 'verbosity',       desc => '{import.verbosity}',
      type => 'int',             range => '0,',
      deft => '1',               reqd => 'no',
      modegli => '4' },

    { name => 'xml',             desc => '',
      type => 'flag',            reqd => 'no',
      hiddengli => 'yes' },
];

# Top-level description of the script, handed to the usage printers.
my $options = {
    name => 'explode_metadata_database.pl',
    desc => '{explode.desc}',
    args => $arguments,
};
94
95
# Entry point: "explodes" a metadata database file into a directory holding
# one document file per record plus a metadata.xml describing them.
#
# Steps, in order:
#   1. Parse the command-line options (see $arguments) from @ARGV; the first
#      remaining argument is the database file name.
#   2. Load the named plugin, read the database through it, and split the text
#      into records using the plugin's 'split_exp'.
#   3. For every record, run the plugin's process() on it, obtain the document
#      named by -document_field (if any) or create an empty ".nul" placeholder,
#      and append a <FileSet> entry to metadata.xml.
#   4. Delete the original database file and let the plugin tidy up.
#
# Dies (after printing usage where appropriate) on bad arguments, if the
# output directory already exists, or if metadata.xml cannot be written.  The
# original database is only deleted once metadata.xml has been closed
# successfully.
sub main
{
    my ($language, $input_encoding, $metadata_set, $plugin, $filename_field,
        $document_field, $document_prefix, $document_suffix, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # Copy each parsed option into the lexical of the same name.  An explicit
    # table is used instead of a string eval, so an unexpected key can never
    # touch any other variable; keys with no matching variable are ignored.
    my %option_variable_for = (
        'language'        => \$language,
        'plugin'          => \$plugin,
        'input_encoding'  => \$input_encoding,
        'metadata_set'    => \$metadata_set,
        'document_field'  => \$document_field,
        'document_prefix' => \$document_prefix,
        'document_suffix' => \$document_suffix,
        'filename_field'  => \$filename_field,
        'verbosity'       => \$verbosity,
        'xml'             => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $variable_ref = $option_variable_for{$strVariable};
        next unless defined $variable_ref;
        $$variable_ref = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing
    if ($intArgLeftinAfterParsing > 1)
    {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a filename";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{explode.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # check metadata set: a non-blank set name becomes a "name." prefix
    if (defined $metadata_set && $metadata_set =~ /\w/) {
        $metadata_set .= ".";
    } else {
        $metadata_set = "";
    }

    # check filename field: an empty value means "not specified"
    if (defined $filename_field && $filename_field eq "") {
        undef $filename_field;
    }

    # The prefix and suffix are optional; treat "not given" as empty so they
    # can be concatenated safely below
    $document_prefix = "" unless defined $document_prefix;
    $document_suffix = "" unless defined $document_suffix;

    # Load the plugin and create an instance of it (a direct class-method
    # call, so the plugin name is never evaluated as Perl source)
    require "$plugin.pm";
    my $plugobj = eval { $plugin->new() };
    die "$@" if $@;

    # ...and initialize it
    $plugobj->init(1, "STDERR", "STDERR");

    if (defined $input_encoding && $input_encoding eq "auto") {
        $plugobj->{'input_encoding'} = $input_encoding;
        ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }
    my $text = "";
    # Use the plugin's read_file function to avoid duplicating code
    $plugobj->read_file($filename, $input_encoding, undef, \$text);

    # Create a directory to store the document files: the database file name
    # with its extension removed
    my ($documents_directory) = ($filename =~ /(.*)\.[^\.]+$/);
    if (!defined $documents_directory || $documents_directory eq "") {
        die "Error: cannot work out a document directory name from $filename (expected a name of the form <name>.<extension>).\n";
    }
    if (-d $documents_directory) {
        die "Error: document directory $documents_directory already exists (bailing).\n";
    }
    &util::mk_dir($documents_directory);

    # ...and a metadata.xml file for the document metadata (extracted from the database)
    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    if (-e $documents_metadata_xml_file) {
        die "Error: document metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    }

    # Start the metadata.xml file.  This must succeed: the original database
    # is deleted at the end, so carrying on without an output file would lose
    # the metadata altogether.
    open(my $metadata_xml_fh, '>', $documents_metadata_xml_file)
        or die "Error: could not create $documents_metadata_xml_file: $!\n";
    print $metadata_xml_fh
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
        "<DirectoryMetadata>\n";

    # Split the text into records, using the plugin's split_exp
    my $split_exp = $plugobj->{'split_exp'};
    my @metadata_records = split(/$split_exp/, $text);
    print STDERR "Number of records: " . @metadata_records . "\n";

    # Write the metadata from each record to the metadata.xml file
    my $record_number = 0;
    foreach my $record_text (@metadata_records) {

        # Use the plugin's process function to avoid duplicating code
        my $doc_obj = doc->new($filename, "nonindexed_doc");
        $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);
        # Get all the metadata assigned to this record
        my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
        my $document_file;

        # try to get a doc to attach the metadata to
        if (defined $document_field) {
            foreach my $pair (@$record_metadata) {
                my ($field, $value) = (@$pair);

                # Does this metadata element specify a document to obtain?
                # (if several do, the last one obtained is the one recorded)
                if ($field eq $document_field) {
                    my $document_file_full = $document_prefix . $value . $document_suffix;
                    $document_file = &obtain_document($document_file_full, $documents_directory, $verbosity);
                }
            }
        }

        # do we need to create a dummy doc??
        if (not defined $document_file) {
            # try to get a file name from the record's metadata
            my $meta;
            if (defined $filename_field) {
                $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filename_field);
            }

            if (defined $meta) {
                # NOTE(review): only entities are stripped here; a value that
                # contains a directory separator will not name a creatable file
                $meta =~ s/&\w{1,10};//g; # remove entities
                $document_file = "$meta.nul";
                # Add a numeric suffix until the name is unused
                my $num = 0;
                while (-e "$documents_directory/$document_file") {
                    $num++;
                    $document_file = "$meta$num.nul";
                }
            } else {
                # No usable name: fall back to a zero-padded record counter
                $record_number = $record_number + 1;
                $document_file = sprintf("%4.4d", $record_number) . ".nul";
            }

            # Create the empty placeholder; a failure is reported, not fatal
            my $dummy_file = "$documents_directory/$document_file";
            if (open(my $dummy_fh, '>', $dummy_file)) {
                close($dummy_fh);
            } else {
                print STDERR "WARNING: Could not create dummy document file $dummy_file: $!\n";
            }
        }

        &write_metadata_xml_file_entry($metadata_xml_fh, $document_file, $record_metadata, $metadata_set);
    }

    # Finish and close the metadata.xml file.  Buffered write errors surface
    # at close, so check it before the original is removed.
    print $metadata_xml_fh "\n</DirectoryMetadata>\n";
    close($metadata_xml_fh)
        or die "Error: could not finish writing $documents_metadata_xml_file: $! ($filename has not been deleted)\n";

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();
}
264
265
# Appends one <FileSet> entry to an open metadata.xml file.
#
# Parameters:
#   $metadata_xml_file - filehandle to print to
#   $file_name         - name of the document file the metadata belongs to;
#                        written as an XML-escaped regular expression that
#                        matches exactly that name
#   $record_metadata   - reference to a list of [field, value] pairs
#   $meta_prefix       - string put in front of every metadata name
#                        (may be empty)
#
# The fields lastmodified, gsdlsourcefilename, gsdldoctype and FileFormat,
# and any field ending in "^all", are not written.  Returns nothing useful.
sub write_metadata_xml_file_entry
{
    my ($metadata_xml_file, $file_name, $record_metadata, $meta_prefix) = @_;
    $meta_prefix = "" unless defined $meta_prefix;

    # Make $file_name XML-safe.  '&' has to be done first, otherwise the
    # entities produced by the next two lines would themselves be re-escaped.
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it, by
    # backslash-escaping every regular expression metacharacter (not just
    # dots and brackets: a name such as "a+b.nul" must still match itself)
    $file_name =~ s/([\\\.\^\$\*\+\?\|\(\)\[\]\{\}])/\\$1/g;

    print $metadata_xml_file
        "\n" .
        " <FileSet>\n" .
        " <FileName>$file_name</FileName>\n" .
        " <Description>\n";

    foreach my $pair (@$record_metadata) {
        my ($field, $value) = (@$pair);

        # We're only interested in metadata from the database
        next if ($field eq "lastmodified");
        next if ($field eq "gsdlsourcefilename");
        next if ($field eq "gsdldoctype");
        next if ($field eq "FileFormat");

        # Ignore the ^all metadata, since it will be invalid if the source metadata is changed
        next if ($field =~ /\^all$/); # ISISPlug specific!

        # Make $value XML-safe
        $value =~ s/&/&amp;/g; # May mess up existing entities!
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;

        # The metadata name goes inside a double-quoted attribute, so it has
        # to be escaped as well (including the quote character itself)
        my $metadata_name = $meta_prefix . $field;
        $metadata_name =~ s/&/&amp;/g;
        $metadata_name =~ s/</&lt;/g;
        $metadata_name =~ s/>/&gt;/g;
        $metadata_name =~ s/"/&quot;/g;

        print $metadata_xml_file " <Metadata mode=\"accumulate\" name=\"$metadata_name\">$value</Metadata>\n";
    }

    print $metadata_xml_file
        " </Description>\n" .
        " </FileSet>\n";
}
322
# Fetches the document a record refers to into the documents directory.
#
# Parameters:
#   $document_file_full  - where the document is: an http:, https: or ftp:
#                          URL (downloaded with wget), otherwise a path on
#                          disk (copied with util::cp)
#   $documents_directory - directory the document is placed in
#   $verbosity           - above 1 reports each document on STDERR; above 2
#                          also runs wget with --verbose
#
# Returns the name, relative to $documents_directory, of the file the
# metadata should be attached to.  If the document could not be obtained, an
# empty "<name>.nul" placeholder is created and its name returned instead.
# Returns undef (after a warning) when a URL has no file name component to
# save the download under.
sub obtain_document
{
    my ($document_file_full, $documents_directory, $verbosity) = @_;
    $verbosity = 0 unless defined $verbosity;

    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^(?:https?|ftp):/) {
        # The local name is whatever follows the last slash of the URL
        if ($document_file_full !~ /([^\/]+)$/) {
            print STDERR "WARNING: Could not obtain document file $document_file_full (the URL contains no file name)\n";
            return;
        }
        $document_file_name = $1;
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        my @wget_options = ($verbosity > 2) ? ("--verbose") : ("--quiet");
        push(@wget_options, "--timestamping"); # Only re-download files if they're newer

        # List-form system() runs wget directly, without a shell, so
        # characters such as '&', ';' or spaces in the URL or in the local
        # path are passed through untouched
        my $wget_status = system("wget", @wget_options, $document_file_full,
                                 "--output-document", $local_document_file);

        # A failed download can leave an empty output file behind; remove it
        # so that the existence check below reports the failure
        if ($wget_status != 0 && -z $local_document_file) {
            unlink($local_document_file);
        }
    }
    # Document specified is on the disk
    else {
        # NOTE(review): $dir_sep is interpolated into the pattern as-is, so
        # this presumably relies on util::get_os_dirsep() returning the
        # separator already escaped for use in a regular expression - confirm
        my $dir_sep = &util::get_os_dirsep();
        $document_file_full =~ /(.+$dir_sep)?(.*)$/;
        $document_file_name = $2;
        $local_document_file = &util::filename_cat($documents_directory, $document_file_name);

        &util::cp($document_file_full, $documents_directory);
    }

    # Check the document was obtained successfully
    if (!-e $local_document_file) {
        print STDERR "WARNING: Could not obtain document file $document_file_full\n";

        # Stand in an empty placeholder so the metadata still has a file to
        # be attached to
        $document_file_name .= ".nul";
        if (open(my $null_fh, '>', "$local_document_file.nul")) {
            close($null_fh);
        } else {
            print STDERR "WARNING: Could not create placeholder file $local_document_file.nul: $!\n";
        }
    }

    return $document_file_name;
}
366
# Run the script, handing the command-line arguments to main().
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.