Changeset 10338
- Timestamp:
- 2005-07-28T11:31:20+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/explode_metadata_database.pl
r9147 r10338 8 8 } 9 9 10 11 use parsargv; 10 use strict; 11 no strict 'subs'; # allow barewords (eg STDERR) as function arguments 12 no strict 'refs'; # allow filehandles to be variables and vice versa 12 13 use printusage; 14 use parse2; 15 13 16 my $unicode_list = 14 17 [ { 'name' => "auto", … … 22 25 23 26 my $arguments = 24 [ { 'name' => "input_encoding", 27 [ 28 { 'name' => "plugin", 29 'desc' => "{explode.plugin}", 30 'type' => "string", 31 'reqd' => "yes", 32 'hiddengli' => "yes"}, 33 { 'name' => "input_encoding", 25 34 'desc' => "{explode.encoding}", 26 35 'type' => "enum", … … 33 42 'reqd' => "no", 34 43 'hiddengli' => "yes"}, 35 { 'name' => "plugin",36 'desc' => "{explode.plugin}",37 'type' => "string",38 'reqd' => "yes",39 'hiddengli' => "yes"},40 44 { 'name' => "document_field", 41 45 'desc' => "{explode.document_field}", … … 53 57 'desc' => "{explode.filename_field}", 54 58 'type' => "string", 55 'reqd' => "no"} 59 'reqd' => "no"}, 60 { 'name' => "verbosity", 61 'desc' => "{import.verbosity}", 62 'type' => "int", 63 'range' => "0,", 64 'deft' => "1", 65 'reqd' => "no", 66 'modegli' => "4" }, 67 { 'name' => "xml", 68 'desc' => "", 69 'type' => "flag", 70 'reqd' => "no", 71 'hiddengli' => "yes" } 56 72 ]; 57 73 … … 63 79 sub main 64 80 { 65 my ($ encoding, $metadata_set, $plugin, $filename_field,66 $document_field, $document_prefix, $document_suffix );81 my ($language, $input_encoding, $metadata_set, $plugin, $filename_field, 82 $document_field, $document_prefix, $document_suffix, $verbosity); 67 83 68 84 my $xml = 0; 69 # Parse command line arguments 70 if (!parsargv::parse(\@ARGV, 71 'language/.*/', \$language, 72 'input_encoding/.*/auto', \$encoding, 73 'metadata_set/.*/', \$metadata_set, 74 'plugin/.*/', \$plugin, 75 'filename_field/.*/', \$filename_field, 76 'document_field/.*/', \$document_field, 77 'document_prefix/.*/', \$document_prefix, 78 'document_suffix/.*/', \$document_suffix, 79 q^xml^, \$xml)) { 85 86 my $hashParsingResult = {}; 87 my $blnParseFailed = "false"; 88 # parse the options 89 my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options"); 90 # There should one arg left after parsing 91 if($intArgLeftinAfterParsing > 1) 92 { 80 93 &PrintUsage::print_txt_usage($options, "{explode.params}"); 81 94 die "\n"; 82 95 } 83 96 97 foreach my $strVariable (keys %$hashParsingResult) 98 { 99 eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}"; 100 } 101 84 102 # If $language has been specified, load the appropriate resource bundle 85 103 # (Otherwise, the default resource bundle will be loaded automatically) 86 if ($language ) {104 if ($language && $language =~ /\S/) { 87 105 &gsprintf::load_language_specific_resource_bundle($language); 88 106 } … … 121 139 122 140 #check filename field 123 if (defined $file anme_field && $filenmae_field eq "") {141 if (defined $filename_field && $filename_field eq "") { 124 142 undef $filename_field; 125 143 } … … 132 150 $plugobj->init(1, "STDERR", "STDERR"); 133 151 134 if ($encoding eq "auto") { 135 $plugobj->{'input_encoding'} = $encoding; 136 (my $language, $encoding) = $plugobj->textcat_get_language_encoding ($filename); 137 } 152 if ($input_encoding eq "auto") { 153 $plugobj->{'input_encoding'} = $input_encoding; 154 ($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename); 155 } 156 my $text = ""; 138 157 # Use the plugin's read_file function to avoid duplicating code 139 $plugobj->read_file($filename, $ encoding, undef, \$text);158 $plugobj->read_file($filename, $input_encoding, undef, \$text); 140 159 141 160 # Create a directory to store the document files... … … 147 166 148 167 # ...and a metadata.xml file for the document metadata (extracted from the database) 149 $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");168 my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml"); 150 169 if (-e $documents_metadata_xml_file) { 151 170 die "Error: document metadata.xml file $documents_metadata_xml_file already exists (bailing).\n"; … … 166 185 # Write the metadata from each record to the metadata.xml file 167 186 my $record_number = 0; 168 foreach $record_text (@metadata_records) {187 foreach my $record_text (@metadata_records) { 169 188 170 189 # Use the plugin's process function to avoid duplicating code … … 183 202 if ($field eq $document_field) { 184 203 my $document_file_full = $document_prefix . $value . $document_suffix; 185 $document_file = &obtain_document($ self, $document_file_full, $documents_directory);204 $document_file = &obtain_document($document_file_full, $documents_directory, $verbosity); 186 205 } 187 206 } … … 191 210 # try to get a file name 192 211 if (defined $filename_field) { 212 193 213 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filename_field); 194 214 if (defined $meta) { … … 284 304 sub obtain_document 285 305 { 286 my $self = shift(@_);287 306 my $document_file_full = shift(@_); 288 307 my $documents_directory = shift(@_); 289 290 my $outhandle = $self->{'outhandle'}; 291 print $outhandle "Obtaining document file $document_file_full...\n" 292 if ($self->{'verbosity'} > 1); 308 my $verbosity = shift(@_); 309 310 print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1); 293 311 294 312 my $document_file_name; … … 302 320 303 321 my $wget_options = "--quiet"; 304 $wget_options = "--verbose" if ($ self->{'verbosity'}> 2);322 $wget_options = "--verbose" if ($verbosity > 2); 305 323 $wget_options .= " --timestamping"; # Only re-download files if they're newer 306 324 `wget $wget_options $document_file_full --output-document $local_document_file`;
Note:
See TracChangeset
for help on using the changeset viewer.