Changeset 17216


Ignore:
Timestamp:
2008-09-08T14:59:50+12:00 (16 years ago)
Author:
kjdon
Message:

Trying to get OAI files exploding. I have copied in some code from one of David's obsolete files. I think it works, but I haven't tested it fully yet. I wanted to get the code committed, though.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r17197 r17216  
    3535use ReadXMLFile;
    3636use ReadTextFile; # needed for subroutine textcat_get_language_encoding
     37use metadatautil;
    3738
    3839sub BEGIN {
     
    160161
    161162
     163sub metadata_read {
     164    my $self = shift (@_); 
     165
     166    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
     167
     168    # can we process this file??
     169    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     170    return undef unless $self->can_process_this_file($filename_full_path);
     171   
     172    my $total_count = 0; # is total count used?
     173    if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli)) {
     174    # calling "SUPER::read" at this point sets up $metadata
     175    # data-structure.  We can then, later, in OAIPlug::read decide
     176    # whether this $metadata will stick to an accompanying file,
     177    # or else needs a new doc object to be formed that contains
     178    # purely metadata
     179       
     180    $self->{'metadata'} = undef;
     181   
     182    #my $url_array = $metadata->{'gi.Sourcedoc'};
     183    my $url_array = $metadata->{'dc.Identifier'};
     184    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
     185
     186    my $srcdoc_exists = 0;
     187    my $srcdoc_pos = 0;
     188    my $filename_dir = &util::filename_head($filename_full_path);
     189       
     190    for (my $i=0; $i<$num_urls; $i++) {
     191       
     192        if ($url_array->[$i] !~ m/^(https?|ftp):/) {
     193       
     194        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
     195       
     196        if (-e $src_filename) {
     197            $srcdoc_pos = $i;
     198            $srcdoc_exists = 1;
     199            last;
     200        }
     201        }
     202    }
     203   
     204   
     205    if ($srcdoc_exists)
     206    {
     207        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
     208       
     209###     print STDERR "**** storing OAI file: $file\n";
     210       
     211        # Make pretty print metadata table stick with src filename
     212        my $ppmd_table = $self->{'ppmd_table'};
     213       
     214        $metadata->{'prettymd'} = [ $ppmd_table ];
     215        $self->{'ppmd_table'} = undef;
     216       
     217    }
     218    else {
     219        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
     220    }
     221   
     222    }
     223    else {
     224    return undef;
     225    }
     226}
    162227
    163228
    164229sub read {
     230    my $self = shift (@_); 
     231 
     232    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     233
     234
     235###    print STDERR "**** checking OAI read: $file\n";
     236
     237    if (defined $self->{'oai-files'}->{$file}) {
     238   
     239    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
     240
     241    # no more need to access details of this $file => tidy up as you go
     242    delete $self->{'oai-files'}->{$file};
     243
     244### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";
     245    if (!$srcdoc_exists)
     246    {
     247
     248        my $filename = $file;
     249        $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     250
     251        # Do encoding stuff on metadata
     252        my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
     253
     254        # create a new document
     255        my $doc_obj = new doc ($filename, "indexed_doc");
     256        my $top_section = $doc_obj->get_top_section;
     257        my $plugin_type = $self->{'plugin_type'};
     258       
     259        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
     260        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
     261        $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
     262        $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
     263        $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
     264       
     265        # include any metadata passed in from previous plugins
     266        # note that this metadata is associated with the top level section
     267        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     268       
     269        # do plugin specific processing of doc_obj
     270        my $textref = \$self->{'rawxml'};
     271        unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
     272        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
     273        return -1;
     274        }
     275       
     276        # do any automatic metadata extraction
     277        $self->auto_extract_metadata ($doc_obj);
     278       
     279        # add an OID
     280        $self->add_OID($doc_obj);
     281       
     282        my $prettymds = $self->{'prettymd'};
     283        foreach my $prettymd (@$prettymds) {
     284        $doc_obj->add_utf8_metadata($top_section,"prettymd",$prettymd);
     285        }
     286        $self->{'prettymd'} = undef;
     287       
     288        # process the document
     289        $processor->process($doc_obj);
     290       
     291        $self->{'num_processed'} ++;
     292       
     293        return 1; # processed the file
     294    }
     295    }
     296    else {
     297    return undef;
     298    }
     299}
     300
     301
     302sub read_old {
    165303    my $self = shift (@_); 
    166304 
Note: See TracChangeset for help on using the changeset viewer.