Ignore:
Timestamp:
2008-09-15T15:28:48+12:00 (16 years ago)
Author:
kjdon
Message:

Previous changes to get exploding working (using metadata_read) meant that general collection building with this plugin stopped working :-(. I think I have fixed that, but the fix will have broken exploding in the meantime. I'll fix that next, but am going home now.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r17216 r17290  
    4747    'type' => "regexp",
    4848    'reqd' => "no",
    49     'deft' => &get_default_process_exp() }
     49    'deft' => &get_default_process_exp() },
     50      { 'name' => "xxx",
     51    'desc' => "{OAIPlugin.xxx}",
     52    'type' => "metadata",
     53    'reqd' => "no",
     54    'deft' => "gi.Sourcedoc" }
    5055      ];
    5156
     
    8893    $self->{'in_metadata_node'} = 0;
    8994    $self->{'rawxml'} = "";
     95    $self->{'saved_metadata'} = {};
    9096}
    9197
     
    140146    if ($element eq "metadata") {
    141147    my $textref = \$self->{'metadata_xml'};
    142     my $metadata = $self->{'metadata'};
     148    #my $metadata = $self->{'metadata'};
     149    my $metadata = $self->{'saved_metadata'};
    143150    $self->extract_oai_metadata($textref,$metadata);
    144151
     
    169176    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    170177    return undef unless $self->can_process_this_file($filename_full_path);
     178#    print STDERR "initial\n";
     179#   foreach my $k (keys %$metadata) {
     180#   print STDERR "$k=".join (", ", @{$metadata->{$k}})."; ";
     181#    }
     182#    print STDERR "\n";
    171183   
    172184    my $total_count = 0; # is total count used?
    173     if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli)) {
    174     # calling "SUPER::read" at this point sets up $metadata
    175     # data-structure.  We can then, later, in OAIPlug::read decide
    176     # whether this $metadata will stick to an accompanying file,
    177     # or else needs a new doc object to be formed that contains
    178     # purely metadata
    179        
    180     $self->{'metadata'} = undef;
     185    if (!$self->parse_file($filename_full_path, $file, $gli)) {
     186    $self->{'saved_metadata'} = undef;
     187    return undef;
     188    }
     189
     190    my $new_metadata = $self->{'saved_metadata'};
     191    $self->{'saved_metadata'} = undef;
     192    # add the pretty metadata table as metadata
     193    my $ppmd_table = $self->{'ppmd_table'};
     194    $new_metadata->{'prettymd'} = $ppmd_table;
     195    $self->{'ppmd_table'} = undef;
     196   
     197    print STDERR "after parse\n";
     198    foreach my $k (keys %$new_metadata) {
     199    print STDERR "$k=".join (", ", @{$new_metadata->{$k}})."; ";
     200    }
     201    print STDERR "\n";
     202   
     203   
     204    #   if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$new_metadata,$processor,$maxdocs,$total_count, $gli)) {
     205    # calling "SUPER::read" at this point sets up $metadata
     206    # data-structure.  We can then, later, in OAIPlug::read decide
     207    # whether this $metadata will stick to an accompanying file,
     208    # or else needs a new doc object to be formed that contains
     209    # purely metadata
     210   
     211#   $self->{'metadata'} = undef;
     212#   print STDERR "after erad\n";
     213#   foreach my $k (keys %$metadata) {
     214#       print STDERR "$k=".join (", ", @{$metadata->{$k}})."; ";
     215#   }
     216#   print STDERR "\n";
     217    my $url_array = $new_metadata->{'dc.Identifier'};
     218    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
     219    print STDERR "$num_urls urls for $file\n";
     220    my $srcdoc_exists = 0;
     221    my $srcdoc_pos = 0;
     222    my $filename_dir = &util::filename_head($filename_full_path);
     223    my $filename_for_metadata = $file;
     224    for (my $i=0; $i<$num_urls; $i++) {
    181225   
    182     #my $url_array = $metadata->{'gi.Sourcedoc'};
    183     my $url_array = $metadata->{'dc.Identifier'};
    184     my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
    185 
    186     my $srcdoc_exists = 0;
    187     my $srcdoc_pos = 0;
    188     my $filename_dir = &util::filename_head($filename_full_path);
    189        
    190     for (my $i=0; $i<$num_urls; $i++) {
    191        
    192         if ($url_array->[$i] !~ m/^(https?|ftp):/) {
    193        
    194         my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
    195        
    196         if (-e $src_filename) {
    197             $srcdoc_pos = $i;
    198             $srcdoc_exists = 1;
    199             last;
    200         }
     226    if ($url_array->[$i] !~ m/^(https?|ftp):/) {
     227       
     228        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
     229       
     230        if (-e $src_filename) {
     231        $srcdoc_pos = $i;
     232        $srcdoc_exists = 1;
     233        $filename_for_metadata = $url_array->[$i];
     234        last;
    201235        }
    202236    }
    203    
    204    
    205     if ($srcdoc_exists)
    206     {
    207         $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
    208        
     237    }
     238   
     239   
     240    if ($srcdoc_exists)
     241    {
     242    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
     243    }
     244    else {
     245    # save the rawxml for the source document
     246    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
     247    $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'};
     248    $self->{'rawxml'} = "";
     249    print STDERR "raw xml = $self->{'oai-files'}->{$file}->{'rawxml'}\n";
     250    }
     251   
    209252###     print STDERR "**** storing OAI file: $file\n";
    210        
    211         # Make pretty print metadata table stick with src filename
    212         my $ppmd_table = $self->{'ppmd_table'};
    213        
    214         $metadata->{'prettymd'} = [ $ppmd_table ];
    215         $self->{'ppmd_table'} = undef;
    216        
    217     }
    218     else {
    219         $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
    220     }
    221    
    222     }
    223     else {
    224     return undef;
    225     }
     253   
     254    # return all the metadata we have extracted to the caller.
     255    # Directory plug will pass it back in at read time, so we don't need to extract it again.
     256    $extrametadata->{$filename_for_metadata} = $new_metadata;
     257    push(@$extrametakeys, $filename_for_metadata);
     258
     259    return 1;
     260   
    226261}
    227262
     
    229264sub read {
    230265    my $self = shift (@_); 
    231  
     266   
    232267    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    233268
     
    235270###    print STDERR "**** checking OAI read: $file\n";
    236271
    237     if (defined $self->{'oai-files'}->{$file}) {
    238    
    239     my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
    240 
     272    if (!defined $self->{'oai-files'}->{$file}) {
     273    return undef;
     274    }
     275   
     276   
     277    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
     278    if ($srcdoc_exists) {
     279    # do nothing more
    241280    # no more need to access details of this $file => tidy up as you go
    242281    delete $self->{'oai-files'}->{$file};
     282    return 0; # not processed here, but don't pass on to rest of plugins
     283    }
    243284
    244285### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";
    245     if (!$srcdoc_exists)
    246     {
    247 
    248         my $filename = $file;
    249         $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    250 
    251         # Do encoding stuff on metadata
    252         my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    253 
    254         # create a new document
    255         my $doc_obj = new doc ($filename, "indexed_doc");
    256         my $top_section = $doc_obj->get_top_section;
    257         my $plugin_type = $self->{'plugin_type'};
    258        
    259         $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    260         $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    261         $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    262         $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    263         $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
    264        
    265         # include any metadata passed in from previous plugins
    266         # note that this metadata is associated with the top level section
    267         $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    268        
    269         # do plugin specific processing of doc_obj
    270         my $textref = \$self->{'rawxml'};
    271         unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
    272         print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    273         return -1;
    274         }
    275        
    276         # do any automatic metadata extraction
    277         $self->auto_extract_metadata ($doc_obj);
    278        
    279         # add an OID
    280         $self->add_OID($doc_obj);
    281        
    282         my $prettymds = $self->{'prettymd'};
    283         foreach my $prettymd (@$prettymds) {
    284         $doc_obj->add_utf8_metadata($top_section,"prettymd",$prettymd);
    285         }
    286         $self->{'prettymd'} = undef;
    287        
    288         # process the document
    289         $processor->process($doc_obj);
    290        
    291         $self->{'num_processed'} ++;
    292        
    293         return 1; # processed the file
    294     }
    295     }
    296     else {
    297     return undef;
    298     }
     286   
     287    my $filename = $file;
     288    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     289
     290    # Do encoding stuff on metadata
     291    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
     292
     293    # create a new document
     294    my $doc_obj = new doc ($filename, "indexed_doc");
     295    my $top_section = $doc_obj->get_top_section;
     296    my $plugin_type = $self->{'plugin_type'};
     297   
     298    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
     299    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
     300    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
     301    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
     302    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
     303   
     304    # include any metadata passed in from previous plugins
     305    # note that this metadata is associated with the top level section
     306    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     307   
     308    # do plugin specific processing of doc_obj
     309    print STDERR "raw xml 2 = $self->{'oai-files'}->{$file}->{'rawxml'}\n";
     310    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
     311    delete $self->{'oai-files'}->{$file};
     312
     313    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
     314    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
     315    return -1;
     316    }
     317   
     318    # do any automatic metadata extraction
     319    $self->auto_extract_metadata ($doc_obj);
     320   
     321    # add an OID
     322    $self->add_OID($doc_obj);
     323       
     324    # process the document
     325    $processor->process($doc_obj);
     326   
     327    $self->{'num_processed'} ++;
     328   
     329    return 1; # processed the file
    299330}
    300331
     
    302333sub read_old {
    303334    my $self = shift (@_); 
    304  
     335   
    305336    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    306337
     
    436467    my $style = "style=\'border-bottom: 4px solid #000080\'";
    437468
    438     $self->{'ppmd_table'} = "\n<table $att $style>";
     469    $self->{'ppmd_table'} = "\n<table $att $style>";
    439470}
    440471
Note: See TracChangeset for help on using the changeset viewer.