Changeset 17026 for gsdl/trunk/perllib
- Timestamp:
- 2008-08-27T20:40:20+12:00 (16 years ago)
- Location:
- gsdl/trunk/perllib/plugins
- Files:
-
- 19 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/AbstractPlugin.pm
r16388 r17026 79 79 } 80 80 81 # called once, at the start of processing 81 82 sub init { 82 83 my $self = shift (@_); … … 92 93 } 93 94 95 # called at the beginning of each plugin pass (import has one, building has many) 94 96 sub begin { 97 my $self = shift (@_); 98 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 95 99 96 100 } 97 101 102 # called at the end of each plugin pass 98 103 sub end { 104 my ($self) = shift (@_); 99 105 100 106 } 101 107 108 # called once, after all passes have finished 102 109 sub deinit { 110 my ($self) = @_; 103 111 104 112 } -
gsdl/trunk/perllib/plugins/AutoExtractMetadata.pm
r16698 r17026 89 89 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 90 90 91 $self->SUPER::begin(@_); 92 91 93 #initialise those extractors that need initialisation 92 94 $self->initialise_acronym_extractor(); 93 95 $self->initialise_gis_extractor(); 96 94 97 } 95 98 -
gsdl/trunk/perllib/plugins/BasePlugin.pm
r16997 r17026 79 79 80 80 our $oidtype_list = 81 [ { 'name' => "hash", 81 [ { 'name' => "auto", 82 'desc' => "{BasePlugin.OIDtype.auto}" }, 83 { 'name' => "hash", 82 84 'desc' => "{import.OIDtype.hash}" }, 83 85 { 'name' => "assigned", … … 116 118 'list' => $oidtype_list, 117 119 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl 118 #'deft' => "hash",120 'deft' => "auto", 119 121 'reqd' => "no", 120 122 'modegli' => "2" }, … … 282 284 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 283 285 286 if ($self->{'OIDtype'} eq "auto") { 287 # hasn't been set in the plugin, use the processor values 288 $self->{'OIDtype'} = $processor->{'OIDtype'}; 289 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'}; 290 } 291 if ($self->{'OIDtype'} eq "hash") { 292 # should we hash on the file or on the doc xml?? 293 $self->{'OIDtype'} = $self->get_oid_hash_type(); 294 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) { 295 $self->{'OIDtype'} = "hash_on_file"; 296 } 297 } 284 298 } 285 299 … … 295 309 296 310 my ($self) = @_; 311 } 312 313 # default hashing type is to hash on the original file (or converted file) 314 # override this to return hash_on_ga_xml for filetypes where hashing on the 315 # file is no good eg video 316 sub get_oid_hash_type { 317 318 my $self = shift (@_); 319 320 return "hash_on_file"; 297 321 } 298 322 … … 715 739 my ($doc_obj, $filename_no_path, $file_encoding) = @_; 716 740 741 717 742 my $top_section = $doc_obj->get_top_section(); 718 743 … … 729 754 $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta); 730 755 } 731 756 757 # this should be called by all plugins to set the oid of the doc obj, rather 758 # than calling doc_obj->set_OID directly 732 759 sub add_OID { 733 760 my $self = shift (@_); 734 761 my ($doc_obj) = @_; 735 762 736 # See if a metadata field is specified as the field 737 if ((defined $self->{'use_as_doc_identifier'}) && 
($self->{'use_as_doc_identifier'} ne "")) { 738 my $metadata_doc_id = $self->{'use_as_doc_identifier'}; 739 740 # Consider "tidying" up metadata_doc_id to be something 741 # suitable in a URL 742 # Could even support a user specified plugin RE for this. 743 744 my $top_section = $doc_obj->get_top_section(); 745 my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id); 746 $doc_obj->set_OID($oid); 747 } 748 # See if there is a plugin-specific set_OID function... 749 elsif (defined ($self->can('set_OID'))) { 750 # it will need $doc_obj to set the Identifier metadata... 751 $self->set_OID(@_); # pass through any extra arguments supplied 752 } else { 763 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'}); 764 765 # see if there is a plugin specific set_OID function 766 if (defined ($self->can('set_OID'))) { 767 $self->set_OID(@_); # pass through doc_obj and any extra arguments 768 } 769 else { 753 770 # use the default set_OID() in doc.pm 754 771 $doc_obj->set_OID(); 755 772 } 756 } 757 758 759 773 774 } 775 760 776 # The BasePlugin read_into_doc_obj() function. This function does all the 761 777 # right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in … … 791 807 my $top_section = $doc_obj->get_top_section(); 792 808 793 # this should look at the plugin option too...794 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});795 809 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 796 810 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); -
gsdl/trunk/perllib/plugins/BibTexPlugin.pm
r16104 r17026 815 815 } 816 816 817 sub set_OID {817 sub add_OID { 818 818 my $self = shift (@_); 819 819 my ($doc_obj, $id, $segment_number) = @_; 820 820 821 821 if ( $self->{'key'} eq "default") { 822 $doc_obj->set_OID("$id\_$segment_number"); 822 $self->SUPER::add_OID(@_); 823 # $doc_obj->set_OID("$id\_$segment_number"); 823 824 } else { 824 825 $doc_obj->set_OID($self->{'key'}); -
gsdl/trunk/perllib/plugins/ConvertToRogPlugin.pm
r16580 r17026 344 344 my $docnum = $self->{'docnum'}; 345 345 346 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});347 346 my ($filemeta) = $file =~ /([^\\\/]+)$/; 348 347 $self->set_Source_metadata($doc_obj, $filemeta); … … 392 391 $self->auto_extract_metadata ($doc_obj); 393 392 # add an OID 394 $ doc_obj->set_OID();393 $self->add_OID($doc_obj); 395 394 396 395 my $oid = $doc_obj->get_OID(); -
gsdl/trunk/perllib/plugins/DBPlugin.pm
r16392 r17026 247 247 248 248 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref? 249 249 250 my $base_oid = undef; 250 251 while (scalar(@row_array)) { 251 252 if (defined($dbplug_debug) && $dbplug_debug==1) { … … 255 256 # create a new document 256 257 my $doc_obj = new doc ($filename_full_path, "indexed_doc"); 257 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 258 258 259 my $cursection = $doc_obj->get_top_section(); 259 260 … … 333 334 334 335 if (!defined $unique_id) { 335 $doc_obj->set_OID(); 336 my $id = $doc_obj->get_OID(); 337 $doc_obj->set_OID($id."s$count"); 336 if (!defined $base_oid) { 337 $self->add_OID($doc_obj); 338 $base_oid = $doc_obj->get_OID(); 339 } 340 $doc_obj->set_OID($base_oid."s$count"); 338 341 } else { 339 342 # use our id from the database... -
gsdl/trunk/perllib/plugins/EmailPlugin.pm
r16677 r17026 80 80 } 81 81 82 my $extended_oidtype_list = 83 [ {'name' => "message_id", 84 'desc' => "{EmailPlugin.OIDtype.message_id}" } 85 ]; 86 87 # add in all the standard options from BasePlugin 88 push (@$extended_oidtype_list, @{$BasePlugin::oidtype_list}); 82 89 83 90 my $arguments = … … 95 102 'type' => "flag", 96 103 'reqd' => "no" }, 104 { 'name' => "OIDtype", 105 'desc' => "{import.OIDtype}", 106 'type' => "enum", 107 'list' => $extended_oidtype_list, 108 'deft' => "messsage_id", 109 'reqd' => "no", 110 'modegli' => "2" }, 111 { 'name' => "OIDmetadata", 112 'desc' => "{import.OIDmetadata}", 113 'type' => "metadata", 114 'deft' => "dc.Identifier", 115 'reqd' => "no", 116 'modegli' => "2" }, 97 117 { 'name' => "split_exp", 98 118 'desc' => "{EmailPlugin.split_exp}", … … 1161 1181 } 1162 1182 1163 1164 sub set_OID { 1183 sub get_base_OID { 1184 my $self = shift(@_); 1185 my ($doc_obj) = @_; 1186 1187 ## TODO: need a valid id in case there is no message id... 1188 if ($self->{'OIDtype'} eq "message_id") { 1189 # temporarily set OIDtype to hash to get a base id 1190 $self->{'OIDtype'} = "hash_on_ga_xml"; 1191 $self->add_OID($doc_obj); 1192 my $id = $doc_obj->get_OID(); 1193 $self->{'OIDtype'} = "message_id"; 1194 return $id; 1195 } 1196 return $self->SUPER::get_base_OID(@_); 1197 } 1198 1199 1200 sub add_OID { 1165 1201 my $self = shift (@_); 1166 1202 my ($doc_obj, $id, $segment_number) = @_; 1167 1203 1168 if ( exists $doc_obj->{'msgid'} ) {1204 if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) { 1169 1205 $doc_obj->set_OID($doc_obj->{'msgid'}); 1170 } else { 1206 } 1207 else { 1171 1208 $doc_obj->set_OID("$id\_$segment_number"); 1172 1209 } -
gsdl/trunk/perllib/plugins/FOXPlugin.pm
r16392 r17026 270 270 # create a new document 271 271 my $doc_obj = new doc ($file, "indexed_doc"); 272 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 272 273 273 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 274 274 my $section = $doc_obj->get_top_section(); … … 294 294 295 295 # add an object id 296 $ doc_obj->set_OID();296 $self->add_OID($doc_obj); 297 297 298 298 # process the document -
gsdl/trunk/perllib/plugins/HBPlugin.pm
r16392 r17026 241 241 # create a new document 242 242 my $doc_obj = new doc ($file, "indexed_doc"); 243 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});244 243 245 244 # copy the book cover if it exists … … 322 321 323 322 # add a OID 324 $ doc_obj->set_OID ();323 $self->add_OID($doc_obj); 325 324 326 325 # process the document -
gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm
r16697 r17026 158 158 print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; 159 159 print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'}; 160 161 # reset the base id 162 $self->{'base_oid'} = undef; 160 163 161 164 } … … 193 196 my $file = $self->{'file'}; 194 197 my $doc_obj = new doc($filename); 195 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});196 198 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 197 199 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); … … 211 213 $self->{'doc_obj'} = $doc_obj; 212 214 $self->{'num_processed'}++; 215 if (!defined $self->{'base_oid'}) { 216 $self->SUPER::add_OID($doc_obj); 217 $self->{'base_oid'} = $doc_obj->get_OID(); 218 } 219 213 220 214 221 } … … 309 316 310 317 311 $self->add_OID($doc_obj, $self->{'record_count'});318 $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'}); 312 319 313 320 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'}); … … 396 403 } 397 404 398 399 sub set_OID { 405 sub add_OID { 400 406 my $self = shift (@_); 401 my ($doc_obj, $record_number) = @_; 402 403 # first set it to generate hash value 404 $doc_obj->set_OID(); 405 406 # then top it up with an "r" + record-number suffix 407 my $id = $doc_obj->get_OID(); 408 $doc_obj->set_OID($id . "r" . $record_number); 407 my ($doc_obj, $id, $record_number) = @_; 408 409 my $full_id = $id . "r" . $record_number; 410 if ($self->{'OIDtype'} eq "assigned") { 411 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'}); 412 if (defined $identifier && $identifier ne "") { 413 $OID = $identifier; 414 $OID =~ s/\.//; #remove any periods 415 if ($OID =~ /^[\d]*$/) { 416 $OID = "D" . 
$OID; 417 print STDERR "OID only contains numbers, adding a D\n"; 418 } 419 $full_id = $identifier; 420 } 421 } 422 $doc_obj->set_OID($full_id); 409 423 } 410 424 -
gsdl/trunk/perllib/plugins/MP3Plugin.pm
r16952 r17026 85 85 } 86 86 87 # we don't want to hash on the file 88 sub get_oid_hash_type { 89 my $self = shift (@_); 90 return "hash_on_ga_xml"; 91 } 92 87 93 sub process { 88 94 my $self = shift (@_); … … 90 96 91 97 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 92 # do something about OIDtype so no hashing93 94 # old code was in effect the following.95 if ($doc_obj->{'OIDtype'} =~ /^hash$/) {96 $doc_obj->set_OIDtype ("incremental");97 }98 99 98 100 99 # associate the file with the document -
gsdl/trunk/perllib/plugins/OAIPlugin.pm
r16392 r17026 239 239 240 240 # add an OID 241 $ doc_obj->set_OID();241 $self->add_OID($doc_obj); 242 242 243 243 my $ppmd_table = $self->{'ppmd_table'}; -
gsdl/trunk/perllib/plugins/OggVorbisPlugin.pm
r16960 r17026 81 81 } 82 82 83 # we don't want to hash on the file 84 sub get_oid_hash_type { 85 my $self = shift (@_); 86 return "hash_on_ga_xml"; 87 } 88 83 89 sub process 84 90 { … … 87 93 88 94 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 89 # do something about OIDtype so no hashing90 91 # old code was in effect the following.92 if ($doc_obj->{'OIDtype'} =~ /^hash$/) {93 $doc_obj->set_OIDtype ("incremental");94 }95 95 96 96 my $top_section = $doc_obj->get_top_section(); -
gsdl/trunk/perllib/plugins/PagedImagePlugin.pm
r16849 r17026 497 497 my ($doc_obj, $filename_no_path, $processor) = @_; 498 498 499 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});500 499 my $topsection = $doc_obj->get_top_section(); 501 500 -
gsdl/trunk/perllib/plugins/ReadTextFile.pm
r16765 r17026 137 137 138 138 # this should look at the plugin option too... 139 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});140 139 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 141 140 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); -
gsdl/trunk/perllib/plugins/ReadXMLFile.pm
r16822 r17026 199 199 } 200 200 201 202 # we need to implement read cos we are not just using process_exp to determine203 # whether to process this or not.204 201 sub read { 205 202 my $self = shift (@_); … … 363 360 # create a new document 364 361 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc"); 365 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});366 362 $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 367 363 … … 385 381 386 382 # add an OID 387 $self->add_OID( );383 $self->add_OID($doc_obj); 388 384 389 385 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); -
gsdl/trunk/perllib/plugins/RealMediaPlugin.pm
r16994 r17026 77 77 } 78 78 79 # we don't want to hash on the file 80 sub get_oid_hash_type { 81 my $self = shift (@_); 82 return "hash_on_ga_xml"; 83 } 84 79 85 sub process 80 86 { … … 84 90 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 85 91 my $top_section = $doc_obj->get_top_section(); 86 # prevent hashing: old code was in effect the following.87 if ($doc_obj->{'OIDtype'} =~ /^hash$/) {88 $doc_obj->set_OIDtype ("incremental");89 }90 92 91 93 my $text = ""; -
gsdl/trunk/perllib/plugins/RogPlugin.pm
r16392 r17026 204 204 205 205 # add OID 206 $ doc_obj->set_OID ();206 $self->add_OID($doc_obj); 207 207 208 208 my $oid = $doc_obj->get_OID(); -
gsdl/trunk/perllib/plugins/SplitTextFile.pm
r16700 r17026 234 234 # create a new document 235 235 my $doc_obj = new doc ($filename_full_path, "indexed_doc"); 236 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});237 236 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 238 237 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); … … 248 247 # Calculate a "base" document ID. 249 248 if (!defined $id) { 250 $doc_obj->set_OID(); 251 $id = $doc_obj->get_OID(); 249 $id = $self->get_base_OID($doc_obj); 252 250 } 253 251 … … 274 272 275 273 # add an OID 276 $self-> set_OID($doc_obj, $id, $segment);274 $self->add_OID($doc_obj, $id, $segment); 277 275 278 276 # process the document … … 288 286 } 289 287 290 sub set_OID { 288 sub get_base_OID { 289 my $self = shift(@_); 290 my ($doc_obj) = @_; 291 292 $self->SUPER::add_OID($doc_obj); 293 return $doc_obj->get_OID(); 294 } 295 296 sub add_OID { 291 297 my $self = shift (@_); 292 my ($doc_obj, $id, $segment_number) = @_; 293 294 $doc_obj->set_OID($id . "s" . $segment_number); 295 } 298 my ($doc_obj, $id, $segment) = @_; 299 300 my $full_id = $id . "s" . $segment; 301 if ($self->{'OIDtype'} eq "assigned") { 302 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'}); 303 if (defined $identifier && $identifier ne "") { 304 $full_id = $identifier; 305 $full_id =~ s/\.//; #remove any periods 306 if ($full_id =~ /^[\d]*$/) { 307 $full_id = "D" . $full_id; 308 print STDERR "OID only contains numbers, adding a D\n"; 309 } 310 } 311 } 312 $doc_obj->set_OID($full_id); 313 } 314 296 315 297 316 1;
Note:
See TracChangeset
for help on using the changeset viewer.