Changeset 17290

Show
Ignore:
Timestamp:
15.09.2008 15:28:48 (11 years ago)
Author:
kjdon
Message:

Previous changes to get exploding working (using metadata_read) meant that general collection building with this plugin stopped working :-(. I think I have fixed that, but the fix will have broken exploding in the meantime. I'll fix that next, but am going home now.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r17216 r17290  
    4747    'type' => "regexp", 
    4848    'reqd' => "no", 
    49     'deft' => &get_default_process_exp() } 
     49    'deft' => &get_default_process_exp() }, 
     50      { 'name' => "xxx", 
     51    'desc' => "{OAIPlugin.xxx}", 
     52    'type' => "metadata", 
     53    'reqd' => "no", 
     54    'deft' => "gi.Sourcedoc" } 
    5055      ]; 
    5156 
     
    8893    $self->{'in_metadata_node'} = 0; 
    8994    $self->{'rawxml'} = ""; 
     95    $self->{'saved_metadata'} = {}; 
    9096} 
    9197 
     
    140146    if ($element eq "metadata") { 
    141147    my $textref = \$self->{'metadata_xml'}; 
    142     my $metadata = $self->{'metadata'}; 
     148    #my $metadata = $self->{'metadata'}; 
     149    my $metadata = $self->{'saved_metadata'}; 
    143150    $self->extract_oai_metadata($textref,$metadata); 
    144151 
     
    169176    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    170177    return undef unless $self->can_process_this_file($filename_full_path); 
     178#    print STDERR "initial\n"; 
     179#   foreach my $k (keys %$metadata) { 
     180#   print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 
     181#    } 
     182#    print STDERR "\n"; 
    171183     
    172184    my $total_count = 0; # is total count used? 
    173     if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli)) { 
    174     # calling "SUPER::read" at this point sets up $metadata 
    175     # data-structure.  We can then, later, in OAIPlug::read decide 
    176     # whether this $metadata will stick to an accompanying file, 
    177     # or else needs a new doc object to be formed that contains 
    178     # purely metadata 
    179          
    180     $self->{'metadata'} = undef; 
     185    if (!$self->parse_file($filename_full_path, $file, $gli)) { 
     186    $self->{'saved_metadata'} = undef; 
     187    return undef; 
     188    } 
     189 
     190    my $new_metadata = $self->{'saved_metadata'}; 
     191    $self->{'saved_metadata'} = undef; 
     192    # add the pretty metadata table as metadata 
     193    my $ppmd_table = $self->{'ppmd_table'}; 
     194    $new_metadata->{'prettymd'} = $ppmd_table; 
     195    $self->{'ppmd_table'} = undef; 
     196     
     197    print STDERR "after parse\n"; 
     198    foreach my $k (keys %$new_metadata) { 
     199    print STDERR "$k=".join (", ", @{$new_metadata->{$k}})."; "; 
     200    } 
     201    print STDERR "\n"; 
     202     
     203     
     204    #   if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$new_metadata,$processor,$maxdocs,$total_count, $gli)) { 
     205    # calling "SUPER::read" at this point sets up $metadata 
     206    # data-structure.  We can then, later, in OAIPlug::read decide 
     207    # whether this $metadata will stick to an accompanying file, 
     208    # or else needs a new doc object to be formed that contains 
     209    # purely metadata 
     210     
     211#   $self->{'metadata'} = undef; 
     212#   print STDERR "after erad\n"; 
     213#   foreach my $k (keys %$metadata) { 
     214#       print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 
     215#   } 
     216#   print STDERR "\n"; 
     217    my $url_array = $new_metadata->{'dc.Identifier'}; 
     218    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 
     219    print STDERR "$num_urls urls for $file\n"; 
     220    my $srcdoc_exists = 0; 
     221    my $srcdoc_pos = 0; 
     222    my $filename_dir = &util::filename_head($filename_full_path); 
     223    my $filename_for_metadata = $file; 
     224    for (my $i=0; $i<$num_urls; $i++) { 
    181225     
    182     #my $url_array = $metadata->{'gi.Sourcedoc'}; 
    183     my $url_array = $metadata->{'dc.Identifier'}; 
    184     my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 
    185  
    186     my $srcdoc_exists = 0; 
    187     my $srcdoc_pos = 0; 
    188     my $filename_dir = &util::filename_head($filename_full_path); 
    189          
    190     for (my $i=0; $i<$num_urls; $i++) { 
    191          
    192         if ($url_array->[$i] !~ m/^(https?|ftp):/) { 
    193          
    194         my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 
    195          
    196         if (-e $src_filename) { 
    197             $srcdoc_pos = $i; 
    198             $srcdoc_exists = 1; 
    199             last; 
    200         } 
     226    if ($url_array->[$i] !~ m/^(https?|ftp):/) { 
     227         
     228        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 
     229         
     230        if (-e $src_filename) { 
     231        $srcdoc_pos = $i; 
     232        $srcdoc_exists = 1; 
     233        $filename_for_metadata = $url_array->[$i]; 
     234        last; 
    201235        } 
    202236    } 
    203      
    204      
    205     if ($srcdoc_exists) 
    206     { 
    207         $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 
    208          
     237    } 
     238     
     239     
     240    if ($srcdoc_exists) 
     241    { 
     242    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 
     243    } 
     244    else { 
     245    # save the rawxml for the source document 
     246    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0; 
     247    $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'}; 
     248    $self->{'rawxml'} = ""; 
     249    print STDERR "raw xml = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 
     250    } 
     251     
    209252###     print STDERR "**** storing OAI file: $file\n"; 
    210          
    211         # Make pretty print metadata table stick with src filename 
    212         my $ppmd_table = $self->{'ppmd_table'}; 
    213          
    214         $metadata->{'prettymd'} = [ $ppmd_table ]; 
    215         $self->{'ppmd_table'} = undef; 
    216          
    217     } 
    218     else { 
    219         $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0; 
    220     } 
    221      
    222     } 
    223     else { 
    224     return undef; 
    225     } 
     253     
     254    # return all the metadata we have extracted to the caller. 
     255    # Directory plug will pass it back in at read time, so we don't need to extract it again. 
     256    $extrametadata->{$filename_for_metadata} = $new_metadata; 
     257    push(@$extrametakeys, $filename_for_metadata); 
     258 
     259    return 1; 
     260     
    226261} 
    227262 
     
    229264sub read { 
    230265    my $self = shift (@_);   
    231    
     266     
    232267    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    233268 
     
    235270###    print STDERR "**** checking OAI read: $file\n"; 
    236271 
    237     if (defined $self->{'oai-files'}->{$file}) { 
    238      
    239     my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; 
    240  
     272    if (!defined $self->{'oai-files'}->{$file}) { 
     273    return undef; 
     274    } 
     275     
     276     
     277    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; 
     278    if ($srcdoc_exists) { 
     279    # do nothing more 
    241280    # no more need to access details of this $file => tidy up as you go 
    242281    delete $self->{'oai-files'}->{$file}; 
     282    return 0; # not processed here, but don't pass on to rest of plugins 
     283    } 
    243284 
    244285### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";  
    245     if (!$srcdoc_exists) 
    246     { 
    247  
    248         my $filename = $file; 
    249         $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
    250  
    251         # Do encoding stuff on metadata 
    252         my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 
    253  
    254         # create a new document 
    255         my $doc_obj = new doc ($filename, "indexed_doc"); 
    256         my $top_section = $doc_obj->get_top_section; 
    257         my $plugin_type = $self->{'plugin_type'}; 
    258          
    259         $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
    260         $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 
    261         $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); 
    262         $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); 
    263         $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); 
    264          
    265         # include any metadata passed in from previous plugins  
    266         # note that this metadata is associated with the top level section 
    267         $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 
    268          
    269         # do plugin specific processing of doc_obj 
    270         my $textref = \$self->{'rawxml'}; 
    271         unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { 
    272         print STDERR "<ProcessingError n='$file'>\n" if ($gli); 
    273         return -1; 
    274         } 
    275          
    276         # do any automatic metadata extraction 
    277         $self->auto_extract_metadata ($doc_obj); 
    278          
    279         # add an OID 
    280         $self->add_OID($doc_obj); 
    281          
    282         my $prettymds = $self->{'prettymd'}; 
    283         foreach my $prettymd (@$prettymds) { 
    284         $doc_obj->add_utf8_metadata($top_section,"prettymd",$prettymd); 
    285         } 
    286         $self->{'prettymd'} = undef; 
    287          
    288         # process the document 
    289         $processor->process($doc_obj); 
    290          
    291         $self->{'num_processed'} ++; 
    292          
    293         return 1; # processed the file 
    294     } 
    295     } 
    296     else { 
    297     return undef; 
    298     } 
     286     
     287    my $filename = $file; 
     288    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
     289 
     290    # Do encoding stuff on metadata 
     291    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 
     292 
     293    # create a new document 
     294    my $doc_obj = new doc ($filename, "indexed_doc"); 
     295    my $top_section = $doc_obj->get_top_section; 
     296    my $plugin_type = $self->{'plugin_type'}; 
     297     
     298    $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
     299    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 
     300    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); 
     301    $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); 
     302    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); 
     303     
     304    # include any metadata passed in from previous plugins  
     305    # note that this metadata is associated with the top level section 
     306    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 
     307     
     308    # do plugin specific processing of doc_obj 
     309    print STDERR "raw xml 2 = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 
     310    my $text = $self->{'oai-files'}->{$file}->{'rawxml'}; 
     311    delete $self->{'oai-files'}->{$file}; 
     312 
     313    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { 
     314    print STDERR "<ProcessingError n='$file'>\n" if ($gli); 
     315    return -1; 
     316    } 
     317     
     318    # do any automatic metadata extraction 
     319    $self->auto_extract_metadata ($doc_obj); 
     320     
     321    # add an OID 
     322    $self->add_OID($doc_obj); 
     323         
     324    # process the document 
     325    $processor->process($doc_obj); 
     326     
     327    $self->{'num_processed'} ++; 
     328     
     329    return 1; # processed the file 
    299330} 
    300331 
     
    302333sub read_old { 
    303334    my $self = shift (@_);   
    304    
     335     
    305336    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    306337 
     
    436467    my $style = "style=\'border-bottom: 4px solid #000080\'"; 
    437468 
    438     $self->{'ppmd_table'} = "\n<table $att $style>"; 
     469    $self->{'ppmd_table'} = "\n<table $att $style>"; 
    439470} 
    440471