Ignore:
Timestamp:
2008-09-18T10:03:44+12:00 (16 years ago)
Author:
kjdon
Message:

tidied this up and removed some old code

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r17300 r17319  
    48 48    'reqd' => "no",
    49 49    'deft' => &get_default_process_exp() },
    50       { 'name' => "xxx",
    51     'desc' => "{OAIPlugin.xxx}",
     50      { 'name' => "document_field",
     51    'desc' => "{OAIPlugin.document_field}",
    52 52    'type' => "metadata",
    53 53    'reqd' => "no",
     
    176 176    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    177 177    return undef unless $self->can_process_this_file($filename_full_path);
    178 #    print STDERR "initial\n";
    179 #   foreach my $k (keys %$metadata) {
    180 #   print STDERR "$k=".join (", ", @{$metadata->{$k}})."; ";
    181 #    }
    182 #    print STDERR "\n";
    183    
    184     my $total_count = 0; # is total count used?
     178   
    185 179    if (!$self->parse_file($filename_full_path, $file, $gli)) {
    186 180    $self->{'saved_metadata'} = undef;
     
    190 184    my $new_metadata = $self->{'saved_metadata'};
    191 185    $self->{'saved_metadata'} = undef;
     186
    192 187    # add the pretty metadata table as metadata
    193 188    my $ppmd_table = $self->{'ppmd_table'};
    194 189    $new_metadata->{'prettymd'} = $ppmd_table;
    195 190    $self->{'ppmd_table'} = undef;
    196    
    197     print STDERR "after parse\n";
    198     foreach my $k (keys %$new_metadata) {
    199     print STDERR "$k=".join (", ", @{$new_metadata->{$k}})."; ";
    200     }
    201     print STDERR "\n";
    202    
    203    
    204     #   if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$new_metadata,$processor,$maxdocs,$total_count, $gli)) {
    205     # calling "SUPER::read" at this point sets up $metadata
    206     # data-structure.  We can then, later, in OAIPlug::read decide
    207     # whether this $metadata will stick to an accompanying file,
    208     # or else needs a new doc object to be formed that contains
    209     # purely metadata
    210    
    211 #   $self->{'metadata'} = undef;
    212 #   print STDERR "after erad\n";
    213 #   foreach my $k (keys %$metadata) {
    214 #       print STDERR "$k=".join (", ", @{$metadata->{$k}})."; ";
    215 #   }
    216 #   print STDERR "\n";
    217     my $url_array = $new_metadata->{'dc.Identifier'};
     191     
     192    my $document_metadata_field = $self->{'document_field'};
     193    my $url_array = $new_metadata->{$document_metadata_field};
    218 194    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
    219     print STDERR "$num_urls urls for $file\n";
     195    ##print STDERR "$num_urls urls for $file\n";
    220 196    my $srcdoc_exists = 0;
    221 197    my $srcdoc_pos = 0;
    222 198    my $filename_dir = &util::filename_head($filename_full_path);
    223     my $filename_for_metadata = $file;
     199    my $filename_for_metadata = $file; # this assumes there will only be one record per oai file - is this always the case??
    224 200    for (my $i=0; $i<$num_urls; $i++) {
    225 201   
     
    237 213    }
    238 214   
    239    
    240     if ($srcdoc_exists)
    241     {
     215    if ($srcdoc_exists) {
    242 216    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
    243 217    }
     
    247 221    $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'};
    248 222    $self->{'rawxml'} = "";
    249     print STDERR "raw xml = $self->{'oai-files'}->{$file}->{'rawxml'}\n";
    250     }
    251    
    252 ###     print STDERR "**** storing OAI file: $file\n";
     223    }
    253 224   
    254 225    # return all the metadata we have extracted to the caller.
     
    267 238    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    268 239
    269 
    270 ###    print STDERR "**** checking OAI read: $file\n";
    271 
    272 240    if (!defined $self->{'oai-files'}->{$file}) {
    273 241    return undef;
    274 242    }
    275    
    276    
     243       
    277 244    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
    278 245    if ($srcdoc_exists) {
    279     # do nothing more
     246    # do nothing more - all the metadata has been extracted and associated with the srcdoc
    280 247    # no more need to access details of this $file => tidy up as you go
    281 248    delete $self->{'oai-files'}->{$file};
     
    283 250    }
    284 251
    285 ### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";
    286    
    287 252    my $filename = $file;
    288 253    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     
    304 269    # include any metadata passed in from previous plugins
    305 270    # note that this metadata is associated with the top level section
     271    # this will include all the metadata from the oai file that we extracted
     272    # during metadata_read
    306 273    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    307 274   
    308 275    # do plugin specific processing of doc_obj
    309     print STDERR "raw xml 2 = $self->{'oai-files'}->{$file}->{'rawxml'}\n";
    310 276    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    311 277    delete $self->{'oai-files'}->{$file};
     
    331 297
    332 298
    333 sub read_old {
    334     my $self = shift (@_); 
    335    
    336     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    337 
    338     my $outhandle = $self->{'outhandle'};
    339 
    340     my $filename = $file;
    341     $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    342 
    343     # block the srcdocs dir - we will process files in them when we find an OAI record for them
    344     return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
    345     if ($self->SUPER::read(@_)) {
    346     # Do encoding stuff
    347     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    348    
    349     my $url_array = $metadata->{'dc.Identifier'};
    350     my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
    351    
    352     my $srcdoc_exists = 0;
    353     my $srcdoc_pos = 0;
    354     my $filename_dir = &util::filename_head($filename);
    355    
    356     for (my $i=0; $i<$num_urls; $i++) {
    357         if ($url_array->[$i] !~ m/^(http|ftp):/) {
    358        
    359         my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
    360         if (-e $src_filename) {
    361             $srcdoc_pos = $i;
    362             $srcdoc_exists = 1;
    363         }
    364         }
    365     }
    366    
    367     if ($srcdoc_exists)
    368     {
    369         print $outhandle "OAIPlugin: passing metadata on to $url_array->[0]\n"
    370         if ($self->{'verbosity'}>1);
    371        
    372        
    373         # Make pretty print metadata table stick with src filename
    374         my $ppmd_table = $self->{'ppmd_table'};
    375         $metadata->{'prettymd'} = [ $ppmd_table ];
    376         $self->{'ppmd_table'} = undef;
    377        
    378         return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
    379                   $block_hash, $metadata, $processor, $maxdocs,
    380                   $total_count, $gli);
    381     }
    382     else
    383     {
    384         # create a new document
    385         my $doc_obj = new doc ($filename, "indexed_doc");
    386         my $top_section = $doc_obj->get_top_section;
    387         my $plugin_type = $self->{'plugin_type'};
    388        
    389         $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    390         $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    391         $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    392         $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    393         $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
    394        
    395         # include any metadata passed in from previous plugins
    396         # note that this metadata is associated with the top level section
    397         $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    398        
    399         # do plugin specific processing of doc_obj
    400         my $textref = \$self->{'rawxml'};
    401         unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
    402         print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    403         return -1;
    404         }
    405        
    406         # do any automatic metadata extraction
    407         $self->auto_extract_metadata ($doc_obj);
    408        
    409         # add an OID
    410         $self->add_OID($doc_obj);
    411        
    412         my $ppmd_table = $self->{'ppmd_table'};
    413         $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
    414         $self->{'ppmd_table'} = undef;
    415        
    416         # process the document
    417         $processor->process($doc_obj);
    418        
    419         $self->{'num_processed'} ++;
    420        
    421         return 1; # processed the file
    422     }
    423     }
    424     else {
    425     return undef;
    426     }
    427 }
    428 
    429 
    430 299# do plugin specific processing of doc_obj
    431 300sub process {
     
    450 319    $$textref =~ s/\]/&#93;/g;
    451 320
    452 ##    print STDERR "*** adding text: $$textref\n";
    453    
    454 321    $doc_obj->add_utf8_text($cursection, $$textref);
    455 322
     
    475 342    my ($metaname, $metavalue_utf8) = @_;
    476 343
    477 ###    $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
    478 344    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
    479 345
     
    596 462    while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
    597 463    {
    598         # if URL given for document as identifier metadata, store it ...
    599         # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    600 464
    601 465        my $metaname = $1;
    602 466        my $metavalue = $2;
    603 467        $inner_metadata_text = $3;
    604 
    605 #       print STDERR "*** metaname = $metaname\n";
    606 #       print STDERR "*** metavalue = $metavalue\n";
    607 468
    608 469        # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter
     
    611 472        {
    612 473        $metaname = "$top_level_prefix.$metaname";
    613 #       print STDERR "*** metaname = $metaname\tmetavalue = $metavalue\n";
    614 474        }
    615 475        $metaname =~ s/\.(.)/\.\u$1/;
     
    619 479        $metavalue =~ s/\[/&#91;/g;
    620 480        $metavalue =~ s/\]/&#93;/g;
    621 
    622 
    623 #       if ($metaname eq "Identifier")
    624 #       {
    625 #       # name clashes with GSDL reserved metadata name for hash id
    626 #       $metaname = "URL";
    627 #       }
    628 481
    629 482        if (defined $metadata->{$metaname})
Note: See TracChangeset for help on using the changeset viewer.