Changeset 17026

Show
Ignore:
Timestamp:
27.08.2008 20:40:20 (11 years ago)
Author:
kjdon
Message:

OID generation modifications: the OIDtype and OIDmetadata options are now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, the values from import are used. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype, so that doesn't need to be done in other places any more. All plugins have the get_oid_hash_type method - it normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3, OggVorbis, ...)

Location:
gsdl/trunk/perllib/plugins
Files:
19 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/AbstractPlugin.pm

    r16388 r17026  
    7979} 
    8080 
     81# called once, at the start of processing 
    8182sub init { 
    8283    my $self = shift (@_); 
     
    9293} 
    9394 
     95# called at the beginning of each plugin pass (import has one, buildin has many) 
    9496sub begin { 
     97    my $self = shift (@_); 
     98    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 
    9599 
    96100} 
    97101 
     102# called at the end of each plugin pass 
    98103sub end { 
     104    my ($self) = shift (@_); 
    99105 
    100106} 
    101107 
     108# called once, after all passes have finished 
    102109sub deinit { 
     110    my ($self) = @_; 
    103111 
    104112} 
  • gsdl/trunk/perllib/plugins/AutoExtractMetadata.pm

    r16698 r17026  
    8989    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 
    9090 
     91    $self->SUPER::begin(@_); 
     92 
    9193    #initialise those extractors that need initialisation 
    9294    $self->initialise_acronym_extractor(); 
    9395    $self->initialise_gis_extractor(); 
     96 
    9497} 
    9598 
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16997 r17026  
    7979 
    8080our $oidtype_list =  
    81     [ { 'name' => "hash", 
     81    [ { 'name' => "auto", 
     82    'desc' => "{BasePlugin.OIDtype.auto}" }, 
     83      { 'name' => "hash", 
    8284        'desc' => "{import.OIDtype.hash}" }, 
    8385      { 'name' => "assigned", 
     
    116118    'list' => $oidtype_list, 
    117119    # leave default empty so we can tell if its been set or not - if not set will use option from import.pl 
    118     #'deft' => "hash", 
     120    'deft' => "auto", 
    119121    'reqd' => "no", 
    120122    'modegli' => "2" }, 
     
    282284    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 
    283285 
     286    if ($self->{'OIDtype'} eq "auto") { 
     287    # hasn't been set in the plugin, use the processor values 
     288    $self->{'OIDtype'} = $processor->{'OIDtype'}; 
     289    $self->{'OIDmetadata'} = $processor->{'OIDmetadata'}; 
     290    } 
     291    if ($self->{'OIDtype'} eq "hash") { 
     292    # should we hash on the file or on the doc xml?? 
     293    $self->{'OIDtype'} = $self->get_oid_hash_type(); 
     294    if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) { 
     295        $self->{'OIDtype'} = "hash_on_file"; 
     296    } 
     297    } 
    284298} 
    285299 
     
    295309 
    296310    my ($self) = @_; 
     311} 
     312 
     313# default hashing type is to hash on the original file (or converted file) 
     314# override this to return hash_on_ga_xml for filetypes where hashing on the  
     315# file is no good eg video 
     316sub get_oid_hash_type { 
     317 
     318    my $self = shift (@_); 
     319 
     320    return "hash_on_file"; 
    297321} 
    298322 
     
    715739    my ($doc_obj, $filename_no_path, $file_encoding) = @_; 
    716740 
     741 
    717742    my $top_section = $doc_obj->get_top_section(); 
    718743     
     
    729754    $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta); 
    730755} 
    731       
     756    
     757# this should be called by all plugins to set the oid of the doc obj, rather 
     758# than calling doc_obj->set_OID directly 
    732759sub add_OID { 
    733760    my $self = shift (@_);   
    734761    my ($doc_obj) = @_; 
    735762 
    736     # See if a metadata field is specified as the field 
    737     if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) { 
    738     my $metadata_doc_id = $self->{'use_as_doc_identifier'}; 
    739  
    740     # Consider "tidying" up metadata_doc_id to be something 
    741     # suitable in a URL 
    742     # Could even support a user specified plugin RE for this. 
    743  
    744     my $top_section = $doc_obj->get_top_section(); 
    745     my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id); 
    746         $doc_obj->set_OID($oid); 
    747     } 
    748     # See if there is a plugin-specific set_OID function... 
    749     elsif (defined ($self->can('set_OID'))) { 
    750     # it will need $doc_obj to set the Identifier metadata... 
    751     $self->set_OID(@_); # pass through any extra arguments supplied 
    752     } else { 
     763    $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'}); 
     764 
     765    # see if there is a plugin specific set_OID function 
     766    if (defined ($self->can('set_OID'))) { 
     767    $self->set_OID(@_); # pass through doc_obj and any extra arguments 
     768    } 
     769    else { 
    753770    # use the default set_OID() in doc.pm 
    754771    $doc_obj->set_OID(); 
    755772    } 
    756 } 
    757  
    758  
    759  
     773 
     774} 
     775   
    760776# The BasePlugin read_into_doc_obj() function. This function does all the 
    761777# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in 
     
    791807    my $top_section = $doc_obj->get_top_section(); 
    792808 
    793     # this should look at the plugin option too... 
    794     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});     
    795809    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 
    796810    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 
  • gsdl/trunk/perllib/plugins/BibTexPlugin.pm

    r16104 r17026  
    815815} 
    816816 
    817 sub set_OID { 
     817sub add_OID { 
    818818    my $self = shift (@_); 
    819819    my ($doc_obj, $id, $segment_number) = @_; 
    820820     
    821821    if ( $self->{'key'} eq "default") { 
    822     $doc_obj->set_OID("$id\_$segment_number"); 
     822    $self->SUPER::add_OID(@_); 
     823#   $doc_obj->set_OID("$id\_$segment_number"); 
    823824    } else { 
    824825    $doc_obj->set_OID($self->{'key'}); 
  • gsdl/trunk/perllib/plugins/ConvertToRogPlugin.pm

    r16580 r17026  
    344344    my $docnum = $self->{'docnum'}; 
    345345 
    346     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
    347346    my ($filemeta) = $file =~ /([^\\\/]+)$/; 
    348347    $self->set_Source_metadata($doc_obj, $filemeta); 
     
    392391    $self->auto_extract_metadata ($doc_obj); 
    393392    # add an OID 
    394     $doc_obj->set_OID(); 
     393    $self->add_OID($doc_obj); 
    395394 
    396395    my $oid = $doc_obj->get_OID(); 
  • gsdl/trunk/perllib/plugins/DBPlugin.pm

    r16392 r17026  
    247247 
    248248    @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref? 
    249      
     249 
     250    my $base_oid = undef; 
    250251    while (scalar(@row_array)) { 
    251252    if (defined($dbplug_debug) && $dbplug_debug==1) { 
     
    255256    # create a new document 
    256257    my $doc_obj = new doc ($filename_full_path, "indexed_doc"); 
    257     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
     258 
    258259    my $cursection = $doc_obj->get_top_section(); 
    259260 
     
    333334 
    334335    if (!defined $unique_id) { 
    335              $doc_obj->set_OID(); 
    336              my $id = $doc_obj->get_OID();      
    337          $doc_obj->set_OID($id."s$count"); 
     336        if (!defined $base_oid) { 
     337        $self->add_OID($doc_obj); 
     338        $base_oid = $doc_obj->get_OID();  
     339        } 
     340        $doc_obj->set_OID($base_oid."s$count"); 
    338341    } else { 
    339342        # use our id from the database... 
  • gsdl/trunk/perllib/plugins/EmailPlugin.pm

    r16677 r17026  
    8080} 
    8181 
     82my $extended_oidtype_list =  
     83    [ {'name' => "message_id", 
     84       'desc' => "{EmailPlugin.OIDtype.message_id}" } 
     85      ]; 
     86 
     87# add in all the standard options from BasePlugin 
     88push (@$extended_oidtype_list, @{$BasePlugin::oidtype_list}); 
    8289 
    8390my $arguments =  
     
    95102    'type' => "flag", 
    96103    'reqd' => "no" }, 
     104      { 'name' => "OIDtype", 
     105    'desc' => "{import.OIDtype}", 
     106    'type' => "enum", 
     107    'list' => $extended_oidtype_list, 
     108    'deft' => "messsage_id", 
     109    'reqd' => "no", 
     110    'modegli' => "2" }, 
     111      { 'name' => "OIDmetadata", 
     112    'desc' => "{import.OIDmetadata}", 
     113    'type' => "metadata", 
     114    'deft' => "dc.Identifier", 
     115    'reqd' => "no", 
     116    'modegli' => "2" }, 
    97117      { 'name' => "split_exp", 
    98118    'desc' => "{EmailPlugin.split_exp}", 
     
    11611181} 
    11621182 
    1163  
    1164 sub set_OID { 
     1183sub get_base_OID { 
     1184    my $self = shift(@_); 
     1185    my ($doc_obj) = @_; 
     1186 
     1187    ## TODO: need a valid id in case there is no message id... 
     1188    if ($self->{'OIDtype'} eq "message_id") { 
     1189    # temporarily set OIDtype to hash to get a base id 
     1190    $self->{'OIDtype'} = "hash_on_ga_xml"; 
     1191    $self->add_OID($doc_obj); 
     1192    my $id = $doc_obj->get_OID(); 
     1193    $self->{'OIDtype'} = "message_id"; 
     1194    return $id; 
     1195    } 
     1196    return $self->SUPER::get_base_OID(@_); 
     1197} 
     1198 
     1199 
     1200sub add_OID { 
    11651201    my $self = shift (@_); 
    11661202    my ($doc_obj, $id, $segment_number) = @_; 
    11671203     
    1168     if ( exists $doc_obj->{'msgid'} ) { 
     1204    if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) { 
    11691205    $doc_obj->set_OID($doc_obj->{'msgid'}); 
    1170     } else { 
     1206    } 
     1207    else { 
    11711208    $doc_obj->set_OID("$id\_$segment_number"); 
    11721209    } 
  • gsdl/trunk/perllib/plugins/FOXPlugin.pm

    r16392 r17026  
    270270    # create a new document 
    271271    my $doc_obj = new doc ($file, "indexed_doc"); 
    272     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
     272 
    273273    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    274274    my $section = $doc_obj->get_top_section(); 
     
    294294 
    295295    # add an object id 
    296     $doc_obj->set_OID(); 
     296    $self->add_OID($doc_obj); 
    297297 
    298298    # process the document 
  • gsdl/trunk/perllib/plugins/HBPlugin.pm

    r16392 r17026  
    241241    # create a new document 
    242242    my $doc_obj = new doc ($file, "indexed_doc"); 
    243     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
    244243 
    245244    # copy the book cover if it exists 
     
    322321 
    323322    # add a OID 
    324     $doc_obj->set_OID (); 
     323    $self->add_OID($doc_obj); 
    325324 
    326325    # process the document 
  • gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm

    r16697 r17026  
    158158    print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; 
    159159    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'}; 
     160 
     161    # reset the base id 
     162    $self->{'base_oid'} = undef; 
    160163  
    161164} 
     
    193196    my $file = $self->{'file'}; 
    194197    my $doc_obj = new doc($filename); 
    195     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
    196198    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 
    197199    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 
     
    211213        $self->{'doc_obj'} = $doc_obj;        
    212214    $self->{'num_processed'}++; 
     215    if (!defined $self->{'base_oid'}) { 
     216        $self->SUPER::add_OID($doc_obj); 
     217        $self->{'base_oid'} = $doc_obj->get_OID(); 
     218    } 
     219     
    213220 
    214221    } 
     
    309316     
    310317 
    311     $self->add_OID($doc_obj, $self->{'record_count'}); 
     318    $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'}); 
    312319 
    313320    $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'}); 
     
    396403} 
    397404 
    398  
    399 sub set_OID { 
     405sub add_OID { 
    400406    my $self = shift (@_); 
    401     my ($doc_obj, $record_number) = @_; 
    402      
    403     # first set it to generate hash value 
    404     $doc_obj->set_OID();  
    405  
    406     # then top it up with an "r" + record-number suffix 
    407     my $id = $doc_obj->get_OID(); 
    408     $doc_obj->set_OID($id . "r" . $record_number); 
     407    my ($doc_obj, $id, $record_number) = @_; 
     408 
     409    my $full_id = $id . "r" . $record_number; 
     410    if ($self->{'OIDtype'} eq "assigned") { 
     411    my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'}); 
     412    if (defined $identifier && $identifier ne "") { 
     413        $OID = $identifier; 
     414        $OID =~ s/\.//; #remove any periods 
     415        if ($OID =~ /^[\d]*$/) { 
     416        $OID = "D" . $OID; 
     417        print STDERR "OID only contains numbers, adding a D\n"; 
     418        } 
     419        $full_id = $identifier; 
     420    } 
     421    } 
     422    $doc_obj->set_OID($full_id); 
    409423} 
    410424 
  • gsdl/trunk/perllib/plugins/MP3Plugin.pm

    r16952 r17026  
    8585} 
    8686 
     87# we don't want to hash on the file 
     88sub get_oid_hash_type { 
     89    my $self = shift (@_); 
     90    return "hash_on_ga_xml"; 
     91} 
     92 
    8793sub process { 
    8894    my $self = shift (@_); 
     
    9096 
    9197    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    92     # do something about OIDtype so no hashing 
    93       
    94     # old code was in effect the following.  
    95     if ($doc_obj->{'OIDtype'} =~ /^hash$/) { 
    96     $doc_obj->set_OIDtype ("incremental"); 
    97     } 
    98  
    9998 
    10099    # associate the file with the document 
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r16392 r17026  
    239239         
    240240        # add an OID 
    241         $doc_obj->set_OID(); 
     241        $self->add_OID($doc_obj); 
    242242         
    243243        my $ppmd_table = $self->{'ppmd_table'}; 
  • gsdl/trunk/perllib/plugins/OggVorbisPlugin.pm

    r16960 r17026  
    8181} 
    8282 
     83# we don't want to hash on the file 
     84sub get_oid_hash_type { 
     85    my $self = shift (@_); 
     86    return "hash_on_ga_xml"; 
     87} 
     88 
    8389sub process 
    8490{ 
     
    8793 
    8894    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    89     # do something about OIDtype so no hashing 
    90       
    91     # old code was in effect the following.  
    92     if ($doc_obj->{'OIDtype'} =~ /^hash$/) { 
    93     $doc_obj->set_OIDtype ("incremental"); 
    94     } 
    9595 
    9696    my $top_section = $doc_obj->get_top_section(); 
  • gsdl/trunk/perllib/plugins/PagedImagePlugin.pm

    r16849 r17026  
    497497    my ($doc_obj, $filename_no_path, $processor) = @_; 
    498498 
    499     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
    500499    my $topsection = $doc_obj->get_top_section(); 
    501500 
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r16765 r17026  
    137137 
    138138    # this should look at the plugin option too... 
    139     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});     
    140139    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 
    141140    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 
  • gsdl/trunk/perllib/plugins/ReadXMLFile.pm

    r16822 r17026  
    199199} 
    200200 
    201  
    202 # we need to implement read cos we are not just using process_exp to determine 
    203 # whether to process this or not. 
    204201sub read { 
    205202    my $self = shift (@_);   
     
    363360    # create a new document 
    364361    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc"); 
    365     $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'}); 
    366362    $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    367363 
     
    385381    
    386382    # add an OID 
    387     $self->add_OID(); 
     383    $self->add_OID($doc_obj); 
    388384     
    389385    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
  • gsdl/trunk/perllib/plugins/RealMediaPlugin.pm

    r16994 r17026  
    7777} 
    7878 
     79# we don't want to hash on the file 
     80sub get_oid_hash_type { 
     81    my $self = shift (@_); 
     82    return "hash_on_ga_xml"; 
     83} 
     84 
    7985sub process 
    8086{ 
     
    8490    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    8591    my $top_section = $doc_obj->get_top_section(); 
    86     # prevent hashing: old code was in effect the following.  
    87     if ($doc_obj->{'OIDtype'} =~ /^hash$/) { 
    88     $doc_obj->set_OIDtype ("incremental"); 
    89     } 
    9092 
    9193    my $text = ""; 
  • gsdl/trunk/perllib/plugins/RogPlugin.pm

    r16392 r17026  
    204204     
    205205    # add OID 
    206     $doc_obj->set_OID (); 
     206    $self->add_OID($doc_obj); 
    207207 
    208208    my $oid = $doc_obj->get_OID(); 
  • gsdl/trunk/perllib/plugins/SplitTextFile.pm

    r16700 r17026  
    234234    # create a new document 
    235235    my $doc_obj = new doc ($filename_full_path, "indexed_doc"); 
    236     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
    237236    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 
    238237    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 
     
    248247    # Calculate a "base" document ID. 
    249248    if (!defined $id) { 
    250         $doc_obj->set_OID(); 
    251         $id = $doc_obj->get_OID(); 
     249        $id = $self->get_base_OID($doc_obj); 
    252250    } 
    253251     
     
    274272 
    275273    # add an OID 
    276     $self->set_OID($doc_obj, $id, $segment); 
     274    $self->add_OID($doc_obj, $id, $segment); 
    277275 
    278276    # process the document 
     
    288286} 
    289287 
    290 sub set_OID { 
     288sub get_base_OID { 
     289    my $self = shift(@_); 
     290    my ($doc_obj) = @_; 
     291 
     292    $self->SUPER::add_OID($doc_obj); 
     293    return $doc_obj->get_OID(); 
     294} 
     295 
     296sub add_OID { 
    291297    my $self = shift (@_); 
    292     my ($doc_obj, $id, $segment_number) = @_; 
    293      
    294     $doc_obj->set_OID($id . "s" . $segment_number); 
    295 } 
     298    my ($doc_obj, $id, $segment) = @_; 
     299 
     300    my $full_id = $id . "s" . $segment; 
     301    if ($self->{'OIDtype'} eq "assigned") { 
     302    my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'}); 
     303    if (defined $identifier && $identifier ne "") { 
     304        $full_id = $identifier; 
     305        $full_id =~ s/\.//; #remove any periods 
     306        if ($full_id =~ /^[\d]*$/) { 
     307        $full_id = "D" . $full_id; 
     308        print STDERR "OID only contains numbers, adding a D\n"; 
     309        } 
     310    } 
     311    } 
     312    $doc_obj->set_OID($full_id); 
     313} 
     314 
    296315 
    2973161;