Changeset 17319

Show
Ignore:
Timestamp:
18.09.2008 10:03:44 (11 years ago)
Author:
kjdon
Message:

tidied this up and removed some old code

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r17300 r17319  
    4848    'reqd' => "no", 
    4949    'deft' => &get_default_process_exp() }, 
    50       { 'name' => "xxx", 
    51     'desc' => "{OAIPlugin.xxx}", 
     50      { 'name' => "document_field", 
     51    'desc' => "{OAIPlugin.document_field}", 
    5252    'type' => "metadata", 
    5353    'reqd' => "no", 
     
    176176    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    177177    return undef unless $self->can_process_this_file($filename_full_path); 
    178 #    print STDERR "initial\n"; 
    179 #   foreach my $k (keys %$metadata) { 
    180 #   print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 
    181 #    } 
    182 #    print STDERR "\n"; 
    183      
    184     my $total_count = 0; # is total count used? 
     178     
    185179    if (!$self->parse_file($filename_full_path, $file, $gli)) { 
    186180    $self->{'saved_metadata'} = undef; 
     
    190184    my $new_metadata = $self->{'saved_metadata'}; 
    191185    $self->{'saved_metadata'} = undef; 
     186 
    192187    # add the pretty metadata table as metadata 
    193188    my $ppmd_table = $self->{'ppmd_table'}; 
    194189    $new_metadata->{'prettymd'} = $ppmd_table; 
    195190    $self->{'ppmd_table'} = undef; 
    196      
    197     print STDERR "after parse\n"; 
    198     foreach my $k (keys %$new_metadata) { 
    199     print STDERR "$k=".join (", ", @{$new_metadata->{$k}})."; "; 
    200     } 
    201     print STDERR "\n"; 
    202      
    203      
    204     #   if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$new_metadata,$processor,$maxdocs,$total_count, $gli)) { 
    205     # calling "SUPER::read" at this point sets up $metadata 
    206     # data-structure.  We can then, later, in OAIPlug::read decide 
    207     # whether this $metadata will stick to an accompanying file, 
    208     # or else needs a new doc object to be formed that contains 
    209     # purely metadata 
    210      
    211 #   $self->{'metadata'} = undef; 
    212 #   print STDERR "after erad\n"; 
    213 #   foreach my $k (keys %$metadata) { 
    214 #       print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 
    215 #   } 
    216 #   print STDERR "\n"; 
    217     my $url_array = $new_metadata->{'dc.Identifier'}; 
     191       
     192    my $document_metadata_field = $self->{'document_field'}; 
     193    my $url_array = $new_metadata->{$document_metadata_field}; 
    218194    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 
    219     print STDERR "$num_urls urls for $file\n"; 
     195    ##print STDERR "$num_urls urls for $file\n"; 
    220196    my $srcdoc_exists = 0; 
    221197    my $srcdoc_pos = 0; 
    222198    my $filename_dir = &util::filename_head($filename_full_path); 
    223     my $filename_for_metadata = $file; 
     199    my $filename_for_metadata = $file; # this assumes there will only be one record per oai file - is this always the case?? 
    224200    for (my $i=0; $i<$num_urls; $i++) { 
    225201     
     
    237213    } 
    238214     
    239      
    240     if ($srcdoc_exists) 
    241     { 
     215    if ($srcdoc_exists) { 
    242216    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 
    243217    } 
     
    247221    $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'}; 
    248222    $self->{'rawxml'} = ""; 
    249     print STDERR "raw xml = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 
    250     } 
    251      
    252 ###     print STDERR "**** storing OAI file: $file\n"; 
     223    } 
    253224     
    254225    # return all the metadata we have extracted to the caller. 
     
    267238    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    268239 
    269  
    270 ###    print STDERR "**** checking OAI read: $file\n"; 
    271  
    272240    if (!defined $self->{'oai-files'}->{$file}) { 
    273241    return undef; 
    274242    } 
    275      
    276      
     243         
    277244    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; 
    278245    if ($srcdoc_exists) { 
    279     # do nothing more 
     246    # do nothing more - all the metadata has been extracted and associated with the srcdoc 
    280247    # no more need to access details of this $file => tidy up as you go 
    281248    delete $self->{'oai-files'}->{$file}; 
     
    283250    } 
    284251 
    285 ### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";  
    286      
    287252    my $filename = $file; 
    288253    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
     
    304269    # include any metadata passed in from previous plugins  
    305270    # note that this metadata is associated with the top level section 
     271    # this will include all the metadata from the oai file that we extracted 
     272    # during metadata_read 
    306273    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 
    307274     
    308275    # do plugin specific processing of doc_obj 
    309     print STDERR "raw xml 2 = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 
    310276    my $text = $self->{'oai-files'}->{$file}->{'rawxml'}; 
    311277    delete $self->{'oai-files'}->{$file}; 
     
    331297 
    332298 
    333 sub read_old { 
    334     my $self = shift (@_);   
    335      
    336     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    337  
    338     my $outhandle = $self->{'outhandle'}; 
    339  
    340     my $filename = $file; 
    341     $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
    342  
    343     # block the srcdocs dir - we will process files in them when we find an OAI record for them 
    344     return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/)); 
    345     if ($self->SUPER::read(@_)) { 
    346     # Do encoding stuff 
    347     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 
    348      
    349     my $url_array = $metadata->{'dc.Identifier'}; 
    350     my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 
    351      
    352     my $srcdoc_exists = 0; 
    353     my $srcdoc_pos = 0; 
    354     my $filename_dir = &util::filename_head($filename); 
    355      
    356     for (my $i=0; $i<$num_urls; $i++) { 
    357         if ($url_array->[$i] !~ m/^(http|ftp):/) { 
    358          
    359         my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 
    360         if (-e $src_filename) { 
    361             $srcdoc_pos = $i; 
    362             $srcdoc_exists = 1; 
    363         } 
    364         } 
    365     } 
    366      
    367     if ($srcdoc_exists) 
    368     { 
    369         print $outhandle "OAIPlugin: passing metadata on to $url_array->[0]\n" 
    370         if ($self->{'verbosity'}>1); 
    371          
    372          
    373         # Make pretty print metadata table stick with src filename 
    374         my $ppmd_table = $self->{'ppmd_table'}; 
    375         $metadata->{'prettymd'} = [ $ppmd_table ]; 
    376         $self->{'ppmd_table'} = undef; 
    377          
    378         return &plugin::read ($pluginfo, $filename_dir, $url_array->[0], 
    379                   $block_hash, $metadata, $processor, $maxdocs, 
    380                   $total_count, $gli); 
    381     } 
    382     else 
    383     { 
    384         # create a new document 
    385         my $doc_obj = new doc ($filename, "indexed_doc"); 
    386         my $top_section = $doc_obj->get_top_section; 
    387         my $plugin_type = $self->{'plugin_type'}; 
    388          
    389         $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
    390         $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 
    391         $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); 
    392         $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); 
    393         $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); 
    394          
    395         # include any metadata passed in from previous plugins  
    396         # note that this metadata is associated with the top level section 
    397         $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 
    398          
    399         # do plugin specific processing of doc_obj 
    400         my $textref = \$self->{'rawxml'}; 
    401         unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { 
    402         print STDERR "<ProcessingError n='$file'>\n" if ($gli); 
    403         return -1; 
    404         } 
    405          
    406         # do any automatic metadata extraction 
    407         $self->auto_extract_metadata ($doc_obj); 
    408          
    409         # add an OID 
    410         $self->add_OID($doc_obj); 
    411          
    412         my $ppmd_table = $self->{'ppmd_table'}; 
    413         $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table); 
    414         $self->{'ppmd_table'} = undef; 
    415          
    416         # process the document 
    417         $processor->process($doc_obj); 
    418          
    419         $self->{'num_processed'} ++; 
    420          
    421         return 1; # processed the file 
    422     } 
    423     } 
    424     else { 
    425     return undef; 
    426     } 
    427 } 
    428  
    429  
    430299# do plugin specific processing of doc_obj 
    431300sub process { 
     
    450319    $$textref =~ s/\]/&#93;/g; 
    451320 
    452 ##    print STDERR "*** adding text: $$textref\n"; 
    453      
    454321    $doc_obj->add_utf8_text($cursection, $$textref); 
    455322 
     
    475342    my ($metaname, $metavalue_utf8) = @_; 
    476343 
    477 ###    $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/; 
    478344    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8); 
    479345 
     
    596462    while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s) 
    597463    { 
    598         # if URL given for document as identifier metadata, store it ... 
    599         # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 
    600464 
    601465        my $metaname = $1; 
    602466        my $metavalue = $2; 
    603467        $inner_metadata_text = $3; 
    604  
    605 #       print STDERR "*** metaname = $metaname\n"; 
    606 #       print STDERR "*** metavalue = $metavalue\n"; 
    607468 
    608469        # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip off optional prefix and uppercase first letter 
     
    611472        { 
    612473        $metaname = "$top_level_prefix.$metaname"; 
    613 #       print STDERR "*** metaname = $metaname\tmetavalue = $metavalue\n"; 
    614474        } 
    615475        $metaname =~ s/\.(.)/\.\u$1/; 
     
    619479        $metavalue =~ s/\[/&#91;/g; 
    620480        $metavalue =~ s/\]/&#93;/g; 
    621  
    622  
    623 #       if ($metaname eq "Identifier") 
    624 #       { 
    625 #       # name clashes with GSDL reserved metadata name for hash id 
    626 #       $metaname = "URL"; 
    627 #       } 
    628481 
    629482        if (defined $metadata->{$metaname})