Changeset 17216

Show
Ignore:
Timestamp:
08.09.2008 14:59:50 (11 years ago)
Author:
kjdon
Message:

Trying to get OAI files exploding. I have copied in some code from one of David's obsolete files. I think it works, but I haven't tested it fully yet; I wanted to get the code committed, though.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r17197 r17216  
    3535use ReadXMLFile; 
    3636use ReadTextFile; # needed for subroutine textcat_get_language_encoding 
     37use metadatautil; 
    3738 
    3839sub BEGIN { 
     
    160161 
    161162 
     163sub metadata_read { 
     164    my $self = shift (@_);   
     165 
     166    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 
     167 
     168    # can we process this file?? 
     169    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
     170    return undef unless $self->can_process_this_file($filename_full_path); 
     171     
     172    my $total_count = 0; # is total count used? 
     173    if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli)) { 
     174    # calling "SUPER::read" at this point sets up $metadata 
     175    # data-structure.  We can then, later, in OAIPlug::read decide 
     176    # whether this $metadata will stick to an accompanying file, 
     177    # or else needs a new doc object to be formed that contains 
     178    # purely metadata 
     179         
     180    $self->{'metadata'} = undef; 
     181     
     182    #my $url_array = $metadata->{'gi.Sourcedoc'}; 
     183    my $url_array = $metadata->{'dc.Identifier'}; 
     184    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 
     185 
     186    my $srcdoc_exists = 0; 
     187    my $srcdoc_pos = 0; 
     188    my $filename_dir = &util::filename_head($filename_full_path); 
     189         
     190    for (my $i=0; $i<$num_urls; $i++) { 
     191         
     192        if ($url_array->[$i] !~ m/^(https?|ftp):/) { 
     193         
     194        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 
     195         
     196        if (-e $src_filename) { 
     197            $srcdoc_pos = $i; 
     198            $srcdoc_exists = 1; 
     199            last; 
     200        } 
     201        } 
     202    } 
     203     
     204     
     205    if ($srcdoc_exists) 
     206    { 
     207        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 
     208         
     209###     print STDERR "**** storing OAI file: $file\n"; 
     210         
     211        # Make pretty print metadata table stick with src filename 
     212        my $ppmd_table = $self->{'ppmd_table'}; 
     213         
     214        $metadata->{'prettymd'} = [ $ppmd_table ]; 
     215        $self->{'ppmd_table'} = undef; 
     216         
     217    } 
     218    else { 
     219        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0; 
     220    } 
     221     
     222    } 
     223    else { 
     224    return undef; 
     225    } 
     226} 
    162227 
    163228 
    # read -- import pass for an OAI record file.
    #
    # Only acts on files that metadata_read() registered in
    # $self->{'oai-files'}.  If a matching source document was found during
    # the metadata pass nothing further is built here.  Otherwise a new
    # metadata-only document is created from $self->{'rawxml'}, decorated
    # with Language/Encoding/Plugin/FileFormat/FileSize metadata plus any
    # entries in $self->{'prettymd'}, and handed to $processor.
    #
    # Arguments (after $self):
    #   $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
    #   $maxdocs, $total_count, $gli
    #
    # Returns:
    #   undef - $file was not registered by metadata_read()
    #   -1    - $self->process() failed
    #   0     - registered, but a source document exists, so no document
    #           was created here (previously an implicit false fall-through)
    #   1     - a metadata-only document was created and processed
    sub read {
        my $self = shift (@_);

        my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

        my $oai_info = $self->{'oai-files'}->{$file};
        return undef unless defined $oai_info;

        my $srcdoc_exists = $oai_info->{'srcdoc_exists'};

        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};

        # The metadata belongs to an accompanying source file; nothing to
        # build for the OAI record itself.
        return 0 if $srcdoc_exists;

        my $filename = $file;
        $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

        # Do encoding stuff on metadata
        my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        my $top_section = $doc_obj->get_top_section;
        my $plugin_type = $self->{'plugin_type'};

        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
        $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
        $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
        $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        my $textref = \$self->{'rawxml'};
        unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1;
        }

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID
        $self->add_OID($doc_obj);

        # Attach any stored pretty-print metadata; an unset slot is treated
        # as an empty list.
        foreach my $prettymd (@{ $self->{'prettymd'} || [] }) {
            $doc_obj->add_utf8_metadata($top_section, "prettymd", $prettymd);
        }
        $self->{'prettymd'} = undef;

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;

        return 1; # processed the file
    }
     300 
     301 
     302sub read_old { 
    165303    my $self = shift (@_);   
    166304