Changeset 9147
- Timestamp: 2005-02-23T14:01:04+13:00 (19 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/explode_metadata_database.pl
r9121 r9147 25 25 'desc' => "{explode.encoding}", 26 26 'type' => "enum", 27 'deft' => " iso_8859_1",27 'deft' => "auto", 28 28 'list' => $unicode_list, 29 'reqd' => "no" } 29 'reqd' => "no" }, 30 30 { 'name' => "metadata_set", 31 31 'desc' => "{explode.metadata_set}", 32 32 'type' => "string", 33 'reqd' => "no" } , 33 'reqd' => "no", 34 'hiddengli' => "yes"}, 34 35 { 'name' => "plugin", 35 36 'desc' => "{explode.plugin}", 36 37 'type' => "string", 37 'reqd' => "yes" }, 38 'reqd' => "yes", 39 'hiddengli' => "yes"}, 40 { 'name' => "document_field", 41 'desc' => "{explode.document_field}", 42 'type' => "string", 43 'reqd' => "no"}, 44 { 'name' => "document_prefix", 45 'desc' => "{explode.document_prefix}", 46 'type' => "string", 47 'reqd' => "no"}, 48 { 'name' => "document_suffix", 49 'desc' => "{explode.document_suffix}", 50 'type' => "string", 51 'reqd' => "no"}, 38 52 { 'name' => "filename_field", 39 53 'desc' => "{explode.filename_field}", 40 54 'type' => "string", 41 'reqd' => "no"} 55 'reqd' => "no"} 42 56 ]; 43 57 … … 49 63 sub main 50 64 { 51 my ($encoding, $metadata_set, $plugin, $filename_field); 52 53 65 my ($encoding, $metadata_set, $plugin, $filename_field, 66 $document_field, $document_prefix, $document_suffix); 67 68 my $xml = 0; 54 69 # Parse command line arguments 55 70 if (!parsargv::parse(\@ARGV, 71 'language/.*/', \$language, 56 72 'input_encoding/.*/auto', \$encoding, 57 73 'metadata_set/.*/', \$metadata_set, 58 74 'plugin/.*/', \$plugin, 59 'filename_field/.*/', \$filename_field)) { 75 'filename_field/.*/', \$filename_field, 76 'document_field/.*/', \$document_field, 77 'document_prefix/.*/', \$document_prefix, 78 'document_suffix/.*/', \$document_suffix, 79 q^xml^, \$xml)) { 60 80 &PrintUsage::print_txt_usage($options, "{explode.params}"); 61 81 die "\n"; 82 } 83 84 # If $language has been specified, load the appropriate resource bundle 85 # (Otherwise, the default resource bundle will be loaded automatically) 86 if ($language) { 87 
&gsprintf::load_language_specific_resource_bundle($language); 88 } 89 90 if ($xml) { 91 &PrintUsage::print_xml_usage($options); 92 print "\n"; 93 return; 62 94 } 63 95 … … 89 121 90 122 #check filename field 91 123 if (defined $fileanme_field && $filenmae_field eq "") { 124 undef $filename_field; 125 } 92 126 my $plugobj; 93 127 require "$plugin.pm"; … … 137 171 my $doc_obj = new doc($filename, "nonindexed_doc"); 138 172 $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0); 139 # try to get a file name 173 # Get all the metadata assigned to this record 174 my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section()); 140 175 my $document_file; 141 if (defined $filename_field) { 142 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filename_field); 143 if (defined $meta) { 144 $meta =~ s/&\w{1,10};//g; # remove entities 145 $document_file = "$meta.nul"; 146 my $num = 0; 147 while (-e "$documents_directory/$document_file") { 148 $num++; 149 $document_file = "$meta$num.nul"; 176 177 # try to get a doc to attach the metadata to 178 if (defined $document_field) { 179 foreach my $pair (@$record_metadata) { 180 my ($field, $value) = (@$pair); 181 182 # Does this metadata element specify a document to obtain? 183 if ($field eq $document_field) { 184 my $document_file_full = $document_prefix . $value . $document_suffix; 185 $document_file = &obtain_document($self, $document_file_full, $documents_directory); 186 } 187 } 188 } 189 # do we need to create a dummy doc?? 
190 if (not defined $document_file) { 191 # try to get a file name 192 if (defined $filename_field) { 193 my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filename_field); 194 if (defined $meta) { 195 $meta =~ s/&\w{1,10};//g; # remove entities 196 $document_file = "$meta.nul"; 197 my $num = 0; 198 while (-e "$documents_directory/$document_file") { 199 $num++; 200 $document_file = "$meta$num.nul"; 201 } 202 } else { 203 $record_number = $record_number + 1; 204 $document_file = sprintf("%4.4d", $record_number) . ".nul"; 150 205 } 151 206 } else { … … 153 208 $document_file = sprintf("%4.4d", $record_number) . ".nul"; 154 209 } 155 } else {156 $record_number = $record_number + 1;157 $document_file = sprintf("%4.4d", $record_number) . ".nul"; 210 open(DUMMY_FILE, ">$documents_directory/$document_file"); 211 close(DUMMY_FILE); 212 158 213 } 159 214 160 open(DUMMY_FILE, ">$documents_directory/$document_file"); 161 close(DUMMY_FILE); 162 163 # Look at all the metadata assigned to this record 164 my $record_metadata = $doc_obj->get_all_metadata($cursection); 215 165 216 &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set); 166 217 } … … 171 222 172 223 # Explode means just that: the original file is deleted 173 #&util::rm($filename);224 &util::rm($filename); 174 225 } 175 226 … … 231 282 } 232 283 284 sub obtain_document 285 { 286 my $self = shift(@_); 287 my $document_file_full = shift(@_); 288 my $documents_directory = shift(@_); 289 290 my $outhandle = $self->{'outhandle'}; 291 print $outhandle "Obtaining document file $document_file_full...\n" 292 if ($self->{'verbosity'} > 1); 293 294 my $document_file_name; 295 my $local_document_file; 296 297 # Document specified is on the web 298 if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) { 299 $document_file_full =~ /([^\/]+)$/; 300 $document_file_name = $1; 301 $local_document_file = &util::filename_cat($documents_directory, 
$document_file_name); 302 303 my $wget_options = "--quiet"; 304 $wget_options = "--verbose" if ($self->{'verbosity'} > 2); 305 $wget_options .= " --timestamping"; # Only re-download files if they're newer 306 `wget $wget_options $document_file_full --output-document $local_document_file`; 307 } 308 # Document specified is on the disk 309 else { 310 my $dir_sep = &util::get_os_dirsep(); 311 $document_file_full =~ /(.+$dir_sep)?(.*)$/; 312 $document_file_name = $2; 313 $local_document_file = &util::filename_cat($documents_directory, $document_file_name); 314 315 &util::cp($document_file_full, $documents_directory); 316 } 317 318 # Check the document was obtained successfully 319 if (!-e $local_document_file) { 320 print STDERR "WARNING: Could not obtain document file $document_file_full\n"; 321 return undef; 322 } 323 324 return $document_file_name; 325 } 233 326 234 327 &main(@ARGV);
Note: See TracChangeset for help on using the changeset viewer.