Changeset 17026 for gsdl/trunk/perllib


Ignore:
Timestamp:
2008-08-27T20:40:20+12:00 (16 years ago)
Author:
kjdon
Message:

OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be done in other places any more. All plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3, OggVorbis, ...)

Location:
gsdl/trunk/perllib/plugins
Files:
19 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/AbstractPlugin.pm

    r16388 r17026  
    7979}
    8080
     81# called once, at the start of processing
    8182sub init {
    8283    my $self = shift (@_);
     
    9293}
    9394
     95# called at the beginning of each plugin pass (import has one, buildin has many)
    9496sub begin {
     97    my $self = shift (@_);
     98    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
    9599
    96100}
    97101
     102# called at the end of each plugin pass
    98103sub end {
     104    my ($self) = shift (@_);
    99105
    100106}
    101107
     108# called once, after all passes have finished
    102109sub deinit {
     110    my ($self) = @_;
    103111
    104112}
  • gsdl/trunk/perllib/plugins/AutoExtractMetadata.pm

    r16698 r17026  
    8989    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
    9090
     91    $self->SUPER::begin(@_);
     92
    9193    #initialise those extractors that need initialisation
    9294    $self->initialise_acronym_extractor();
    9395    $self->initialise_gis_extractor();
     96
    9497}
    9598
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16997 r17026  
    7979
    8080our $oidtype_list =
    81     [ { 'name' => "hash",
     81    [ { 'name' => "auto",
     82    'desc' => "{BasePlugin.OIDtype.auto}" },
     83      { 'name' => "hash",
    8284        'desc' => "{import.OIDtype.hash}" },
    8385      { 'name' => "assigned",
     
    116118    'list' => $oidtype_list,
    117119    # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
    118     #'deft' => "hash",
     120    'deft' => "auto",
    119121    'reqd' => "no",
    120122    'modegli' => "2" },
     
    282284    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
    283285
     286    if ($self->{'OIDtype'} eq "auto") {
     287    # hasn't been set in the plugin, use the processor values
     288    $self->{'OIDtype'} = $processor->{'OIDtype'};
     289    $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
     290    }
     291    if ($self->{'OIDtype'} eq "hash") {
     292    # should we hash on the file or on the doc xml??
     293    $self->{'OIDtype'} = $self->get_oid_hash_type();
     294    if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
     295        $self->{'OIDtype'} = "hash_on_file";
     296    }
     297    }
    284298}
    285299
     
    295309
    296310    my ($self) = @_;
     311}
     312
     313# default hashing type is to hash on the original file (or converted file)
     314# override this to return hash_on_ga_xml for filetypes where hashing on the
     315# file is no good eg video
     316sub get_oid_hash_type {
     317
     318    my $self = shift (@_);
     319
     320    return "hash_on_file";
    297321}
    298322
     
    715739    my ($doc_obj, $filename_no_path, $file_encoding) = @_;
    716740
     741
    717742    my $top_section = $doc_obj->get_top_section();
    718743   
     
    729754    $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
    730755}
    731      
     756   
     757# this should be called by all plugins to set the oid of the doc obj, rather
     758# than calling doc_obj->set_OID directly
    732759sub add_OID {
    733760    my $self = shift (@_); 
    734761    my ($doc_obj) = @_;
    735762
    736     # See if a metadata field is specified as the field
    737     if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
    738     my $metadata_doc_id = $self->{'use_as_doc_identifier'};
    739 
    740     # Consider "tidying" up metadata_doc_id to be something
    741     # suitable in a URL
    742     # Could even support a user specified plugin RE for this.
    743 
    744     my $top_section = $doc_obj->get_top_section();
    745     my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
    746         $doc_obj->set_OID($oid);
    747     }
    748     # See if there is a plugin-specific set_OID function...
    749     elsif (defined ($self->can('set_OID'))) {
    750     # it will need $doc_obj to set the Identifier metadata...
    751     $self->set_OID(@_); # pass through any extra arguments supplied
    752     } else {
     763    $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
     764
     765    # see if there is a plugin specific set_OID function
     766    if (defined ($self->can('set_OID'))) {
     767    $self->set_OID(@_); # pass through doc_obj and any extra arguments
     768    }
     769    else {
    753770    # use the default set_OID() in doc.pm
    754771    $doc_obj->set_OID();
    755772    }
    756 }
    757 
    758 
    759 
     773
     774}
     775 
    760776# The BasePlugin read_into_doc_obj() function. This function does all the
    761777# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in
     
    791807    my $top_section = $doc_obj->get_top_section();
    792808
    793     # this should look at the plugin option too...
    794     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});   
    795809    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    796810    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
  • gsdl/trunk/perllib/plugins/BibTexPlugin.pm

    r16104 r17026  
    815815}
    816816
    817 sub set_OID {
     817sub add_OID {
    818818    my $self = shift (@_);
    819819    my ($doc_obj, $id, $segment_number) = @_;
    820820   
    821821    if ( $self->{'key'} eq "default") {
    822     $doc_obj->set_OID("$id\_$segment_number");
     822    $self->SUPER::add_OID(@_);
     823#   $doc_obj->set_OID("$id\_$segment_number");
    823824    } else {
    824825    $doc_obj->set_OID($self->{'key'});
  • gsdl/trunk/perllib/plugins/ConvertToRogPlugin.pm

    r16580 r17026  
    344344    my $docnum = $self->{'docnum'};
    345345
    346     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    347346    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    348347    $self->set_Source_metadata($doc_obj, $filemeta);
     
    392391    $self->auto_extract_metadata ($doc_obj);
    393392    # add an OID
    394     $doc_obj->set_OID();
     393    $self->add_OID($doc_obj);
    395394
    396395    my $oid = $doc_obj->get_OID();
  • gsdl/trunk/perllib/plugins/DBPlugin.pm

    r16392 r17026  
    247247
    248248    @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
    249    
     249
     250    my $base_oid = undef;
    250251    while (scalar(@row_array)) {
    251252    if (defined($dbplug_debug) && $dbplug_debug==1) {
     
    255256    # create a new document
    256257    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
    257     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
     258
    258259    my $cursection = $doc_obj->get_top_section();
    259260
     
    333334
    334335    if (!defined $unique_id) {
    335              $doc_obj->set_OID();
    336              my $id = $doc_obj->get_OID();     
    337          $doc_obj->set_OID($id."s$count");
     336        if (!defined $base_oid) {
     337        $self->add_OID($doc_obj);
     338        $base_oid = $doc_obj->get_OID();
     339        }
     340        $doc_obj->set_OID($base_oid."s$count");
    338341    } else {
    339342        # use our id from the database...
  • gsdl/trunk/perllib/plugins/EmailPlugin.pm

    r16677 r17026  
    8080}
    8181
     82my $extended_oidtype_list =
     83    [ {'name' => "message_id",
     84       'desc' => "{EmailPlugin.OIDtype.message_id}" }
     85      ];
     86
     87# add in all the standard options from BasePlugin
     88push (@$extended_oidtype_list, @{$BasePlugin::oidtype_list});
    8289
    8390my $arguments =
     
    95102    'type' => "flag",
    96103    'reqd' => "no" },
     104      { 'name' => "OIDtype",
     105    'desc' => "{import.OIDtype}",
     106    'type' => "enum",
     107    'list' => $extended_oidtype_list,
     108    'deft' => "messsage_id",
     109    'reqd' => "no",
     110    'modegli' => "2" },
     111      { 'name' => "OIDmetadata",
     112    'desc' => "{import.OIDmetadata}",
     113    'type' => "metadata",
     114    'deft' => "dc.Identifier",
     115    'reqd' => "no",
     116    'modegli' => "2" },
    97117      { 'name' => "split_exp",
    98118    'desc' => "{EmailPlugin.split_exp}",
     
    11611181}
    11621182
    1163 
    1164 sub set_OID {
     1183sub get_base_OID {
     1184    my $self = shift(@_);
     1185    my ($doc_obj) = @_;
     1186
     1187    ## TODO: need a valid id in case there is no message id...
     1188    if ($self->{'OIDtype'} eq "message_id") {
     1189    # temporarily set OIDtype to hash to get a base id
     1190    $self->{'OIDtype'} = "hash_on_ga_xml";
     1191    $self->add_OID($doc_obj);
     1192    my $id = $doc_obj->get_OID();
     1193    $self->{'OIDtype'} = "message_id";
     1194    return $id;
     1195    }
     1196    return $self->SUPER::get_base_OID(@_);
     1197}
     1198
     1199
     1200sub add_OID {
    11651201    my $self = shift (@_);
    11661202    my ($doc_obj, $id, $segment_number) = @_;
    11671203   
    1168     if ( exists $doc_obj->{'msgid'} ) {
     1204    if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
    11691205    $doc_obj->set_OID($doc_obj->{'msgid'});
    1170     } else {
     1206    }
     1207    else {
    11711208    $doc_obj->set_OID("$id\_$segment_number");
    11721209    }
  • gsdl/trunk/perllib/plugins/FOXPlugin.pm

    r16392 r17026  
    270270    # create a new document
    271271    my $doc_obj = new doc ($file, "indexed_doc");
    272     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
     272
    273273    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    274274    my $section = $doc_obj->get_top_section();
     
    294294
    295295    # add an object id
    296     $doc_obj->set_OID();
     296    $self->add_OID($doc_obj);
    297297
    298298    # process the document
  • gsdl/trunk/perllib/plugins/HBPlugin.pm

    r16392 r17026  
    241241    # create a new document
    242242    my $doc_obj = new doc ($file, "indexed_doc");
    243     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    244243
    245244    # copy the book cover if it exists
     
    322321
    323322    # add a OID
    324     $doc_obj->set_OID ();
     323    $self->add_OID($doc_obj);
    325324
    326325    # process the document
  • gsdl/trunk/perllib/plugins/MARCXMLPlugin.pm

    r16697 r17026  
    158158    print $outhandle "MARCXMLPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
    159159    print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlugin'>\n" if $self->{'gli'};
     160
     161    # reset the base id
     162    $self->{'base_oid'} = undef;
    160163 
    161164}
     
    193196    my $file = $self->{'file'};
    194197    my $doc_obj = new doc($filename);
    195     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    196198    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    197199    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
     
    211213        $self->{'doc_obj'} = $doc_obj;       
    212214    $self->{'num_processed'}++;
     215    if (!defined $self->{'base_oid'}) {
     216        $self->SUPER::add_OID($doc_obj);
     217        $self->{'base_oid'} = $doc_obj->get_OID();
     218    }
     219   
    213220
    214221    }
     
    309316   
    310317
    311     $self->add_OID($doc_obj, $self->{'record_count'});
     318    $self->add_OID($doc_obj, $self->{'base_oid'}, $self->{'record_count'});
    312319
    313320    $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'});
     
    396403}
    397404
    398 
    399 sub set_OID {
     405sub add_OID {
    400406    my $self = shift (@_);
    401     my ($doc_obj, $record_number) = @_;
    402    
    403     # first set it to generate hash value
    404     $doc_obj->set_OID();
    405 
    406     # then top it up with an "r" + record-number suffix
    407     my $id = $doc_obj->get_OID();
    408     $doc_obj->set_OID($id . "r" . $record_number);
     407    my ($doc_obj, $id, $record_number) = @_;
     408
     409    my $full_id = $id . "r" . $record_number;
     410    if ($self->{'OIDtype'} eq "assigned") {
     411    my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
     412    if (defined $identifier && $identifier ne "") {
     413        $OID = $identifier;
     414        $OID =~ s/\.//; #remove any periods
     415        if ($OID =~ /^[\d]*$/) {
     416        $OID = "D" . $OID;
     417        print STDERR "OID only contains numbers, adding a D\n";
     418        }
     419        $full_id = $identifier;
     420    }
     421    }
     422    $doc_obj->set_OID($full_id);
    409423}
    410424
  • gsdl/trunk/perllib/plugins/MP3Plugin.pm

    r16952 r17026  
    8585}
    8686
     87# we don't want to hash on the file
     88sub get_oid_hash_type {
     89    my $self = shift (@_);
     90    return "hash_on_ga_xml";
     91}
     92
    8793sub process {
    8894    my $self = shift (@_);
     
    9096
    9197    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    92     # do something about OIDtype so no hashing
    93      
    94     # old code was in effect the following.
    95     if ($doc_obj->{'OIDtype'} =~ /^hash$/) {
    96     $doc_obj->set_OIDtype ("incremental");
    97     }
    98 
    9998
    10099    # associate the file with the document
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r16392 r17026  
    239239       
    240240        # add an OID
    241         $doc_obj->set_OID();
     241        $self->add_OID($doc_obj);
    242242       
    243243        my $ppmd_table = $self->{'ppmd_table'};
  • gsdl/trunk/perllib/plugins/OggVorbisPlugin.pm

    r16960 r17026  
    8181}
    8282
     83# we don't want to hash on the file
     84sub get_oid_hash_type {
     85    my $self = shift (@_);
     86    return "hash_on_ga_xml";
     87}
     88
    8389sub process
    8490{
     
    8793
    8894    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    89     # do something about OIDtype so no hashing
    90      
    91     # old code was in effect the following.
    92     if ($doc_obj->{'OIDtype'} =~ /^hash$/) {
    93     $doc_obj->set_OIDtype ("incremental");
    94     }
    9595
    9696    my $top_section = $doc_obj->get_top_section();
  • gsdl/trunk/perllib/plugins/PagedImagePlugin.pm

    r16849 r17026  
    497497    my ($doc_obj, $filename_no_path, $processor) = @_;
    498498
    499     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    500499    my $topsection = $doc_obj->get_top_section();
    501500
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r16765 r17026  
    137137
    138138    # this should look at the plugin option too...
    139     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});   
    140139    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    141140    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
  • gsdl/trunk/perllib/plugins/ReadXMLFile.pm

    r16822 r17026  
    199199}
    200200
    201 
    202 # we need to implement read cos we are not just using process_exp to determine
    203 # whether to process this or not.
    204201sub read {
    205202    my $self = shift (@_); 
     
    363360    # create a new document
    364361    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
    365     $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
    366362    $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    367363
     
    385381   
    386382    # add an OID
    387     $self->add_OID();
     383    $self->add_OID($doc_obj);
    388384   
    389385    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
  • gsdl/trunk/perllib/plugins/RealMediaPlugin.pm

    r16994 r17026  
    7777}
    7878
     79# we don't want to hash on the file
     80sub get_oid_hash_type {
     81    my $self = shift (@_);
     82    return "hash_on_ga_xml";
     83}
     84
    7985sub process
    8086{
     
    8490    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    8591    my $top_section = $doc_obj->get_top_section();
    86     # prevent hashing: old code was in effect the following.
    87     if ($doc_obj->{'OIDtype'} =~ /^hash$/) {
    88     $doc_obj->set_OIDtype ("incremental");
    89     }
    9092
    9193    my $text = "";
  • gsdl/trunk/perllib/plugins/RogPlugin.pm

    r16392 r17026  
    204204   
    205205    # add OID
    206     $doc_obj->set_OID ();
     206    $self->add_OID($doc_obj);
    207207
    208208    my $oid = $doc_obj->get_OID();
  • gsdl/trunk/perllib/plugins/SplitTextFile.pm

    r16700 r17026  
    234234    # create a new document
    235235    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
    236     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    237236    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    238237    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
     
    248247    # Calculate a "base" document ID.
    249248    if (!defined $id) {
    250         $doc_obj->set_OID();
    251         $id = $doc_obj->get_OID();
     249        $id = $self->get_base_OID($doc_obj);
    252250    }
    253251   
     
    274272
    275273    # add an OID
    276     $self->set_OID($doc_obj, $id, $segment);
     274    $self->add_OID($doc_obj, $id, $segment);
    277275
    278276    # process the document
     
    288286}
    289287
    290 sub set_OID {
     288sub get_base_OID {
     289    my $self = shift(@_);
     290    my ($doc_obj) = @_;
     291
     292    $self->SUPER::add_OID($doc_obj);
     293    return $doc_obj->get_OID();
     294}
     295
     296sub add_OID {
    291297    my $self = shift (@_);
    292     my ($doc_obj, $id, $segment_number) = @_;
    293    
    294     $doc_obj->set_OID($id . "s" . $segment_number);
    295 }
     298    my ($doc_obj, $id, $segment) = @_;
     299
     300    my $full_id = $id . "s" . $segment;
     301    if ($self->{'OIDtype'} eq "assigned") {
     302    my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
     303    if (defined $identifier && $identifier ne "") {
     304        $full_id = $identifier;
     305        $full_id =~ s/\.//; #remove any periods
     306        if ($full_id =~ /^[\d]*$/) {
     307        $full_id = "D" . $full_id;
     308        print STDERR "OID only contains numbers, adding a D\n";
     309        }
     310    }
     311    }
     312    $doc_obj->set_OID($full_id);
     313}
     314
    296315
    2973161;
Note: See TracChangeset for help on using the changeset viewer.