Changeset 9958


Ignore:
Timestamp:
2005-05-25T17:31:13+12:00 (19 years ago)
Author:
davidb
Message:

Upgrading of OAIPlug to be based on XMLPlug. OAIPlug was originally written
before XMLPlug. The rewrite means it can now take advantage of the -xslt
option.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/OAIPlug.pm

    r9853 r9958  
    3232use parsargv;
    3333
     34use XMLPlug;
     35
    3436sub BEGIN {
    35     @OAIPlug::ISA = ('BasPlug');
    36 }
     37    @OAIPlug::ISA = ('XMLPlug');
     38}
     39
    3740
    3841my $arguments =
     
    5255sub new {
    5356    my $class = shift (@_);
    54     my $self = new BasPlug ($class, @_);
     57    my $self = new XMLPlug ($class, @_);
    5558    $self->{'plugin_type'} = "OAIPlug";
    5659    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     
    7578}
    7679
     80sub xml_start_document {
     81    $self->{'in_metadata_node'} = 0;
     82    $self->{'rawxml'} = "";
     83}
     84
     85sub xml_end_document {
     86}
     87
     88sub xml_doctype {
     89    my $self = shift(@_);
     90
     91    my ($expat, $name, $sysid, $pubid, $internal) = @_;
     92
     93    # allow the short-lived and badly named "GreenstoneArchive" files to be processed
     94    # as well as the "Archive" files which should now be created by import.pl
     95    die "" if ($name !~ /^OAI-PMH$/);
     96
     97    my $outhandle = $self->{'outhandle'};
     98    print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
     99    print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
     100
     101}
     102
     103
     104sub xml_start_tag {
     105    my $self = shift(@_);
     106    my ($expat,$element) = @_;
     107
     108    my %attr_hash = %_;
     109
     110    my $attr = "";
     111    map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
     112
     113    $self->{'rawxml'} .= "<$element$attr>";
     114
     115    if ($element eq "metadata") {
     116    $self->{'in_metadata_node'} = 1;
     117    $self->{'metadata_xml'} = "";
     118    }
     119
     120    if ($self->{'in_metadata_node'}) {
     121    $self->{'metadata_xml'} .= "<$element$attr>";
     122    }
     123}
     124
     125sub xml_end_tag {
     126    my $self = shift(@_);
     127    my ($expat, $element) = @_;
     128
     129    $self->{'rawxml'} .= "</$element>";
     130
     131    if ($self->{'in_metadata_node'}) {
     132    $self->{'metadata_xml'} .= "</$element>";
     133    }
     134
     135    if ($element eq "metadata") {
     136    my $textref = \$self->{'metadata_xml'};
     137    my $metadata = $self->{'metadata'};
     138    $self->extract_oai_metadata($textref,$metadata);
     139
     140    $self->{'in_metadata_node'} = 0;   
     141    }
     142
     143
     144}
     145
     146sub xml_text {
     147    my $self = shift(@_);
     148    my ($expat) = @_;
     149
     150    $self->{'rawxml'} .= $_;
     151
     152    if ($self->{'in_metadata_node'}) {
     153    $self->{'metadata_xml'} .= $_;
     154    }
     155}
     156
     157
     158
    77159
    78160sub read {
     
    87169
    88170    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
     171
     172    if ($self->SUPER::read(@_)) {
     173
     174    # Do encoding stuff
     175    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    89176   
    90     if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
    91     $self->{'num_blocked'} ++;
    92     return 0;
    93     }
    94     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     177    my $url_array = $metadata->{'URL'};
     178    my $num_urls = scalar(@$url_array);
     179   
     180    my $srcdoc_exists = 0;
     181    my $srcdoc_pos = 0;
     182    my $filename_dir = &util::filename_head($filename);
     183   
     184    for (my $i=0; $i<$num_urls; $i++) {
     185       
     186        if ($url_array->[$i] !~ m/^(http|ftp):/) {
     187       
     188        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
     189       
     190        if (-e $src_filename) {
     191            $srcdoc_pos = $i;
     192            $srcdoc_exists = 1;
     193        }
     194        }
     195    }
     196   
     197    if ($srcdoc_exists)
     198    {
     199        print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
     200        if ($self->{'verbosity'}>1);
     201       
     202       
     203        # Make pretty print metadata table stick with src filename
     204        my $ppmd_table = $self->{'ppmd_table'};
     205        $metadata->{'prettymd'} = [ $ppmd_table ];
     206        $self->{'ppmd_table'} = undef;
     207       
     208        return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
     209                  $metadata, $processor, $maxdocs, $total_count, $gli);
     210    }
     211    else
     212    {
     213        # create a new document
     214        my $doc_obj = new doc ($filename, "indexed_doc");
     215        my $top_section = $doc_obj->get_top_section;
     216        my $plugin_type = $self->{'plugin_type'};
     217       
     218        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
     219        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
     220        $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
     221        $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
     222        $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
     223       
     224        # include any metadata passed in from previous plugins
     225        # note that this metadata is associated with the top level section
     226        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     227       
     228        # do plugin specific processing of doc_obj
     229        my $textref = \$self->{'rawxml'};
     230        unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
     231        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
     232        return -1;
     233        }
     234       
     235        # do any automatic metadata extraction
     236        $self->auto_extract_metadata ($doc_obj);
     237       
     238        # add an OID
     239        $doc_obj->set_OID();
     240       
     241        my $ppmd_table = $self->{'ppmd_table'};
     242        $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
     243        $self->{'ppmd_table'} = undef;
     244       
     245        # process the document
     246        $processor->process($doc_obj);
     247       
     248        $self->{'num_processed'} ++;
     249       
     250        return 1; # processed the file
     251    }
     252    }
     253    else {
    95254    return undef;
    96     }
    97     $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    98 
    99     # Do encoding stuff
    100     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    101 
    102     ####
    103     # Above code exactly the same as in BasPlug
    104     # => consider making supporting function?
    105     ###
    106    
    107     # read in file ($text will be in utf8)
    108     my $text = "";
    109     $self->read_file ($filename, $encoding, $language, \$text);
    110 
    111     if (!length ($text)) {
    112     print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
    113     return 0;
    114     }
    115 
    116     print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
    117     print $outhandle "OAIPlug: extracting metadata from $file\n"
    118     if ($self->{'verbosity'}>1);
    119 
    120     $self->extract_oai_metadata(\$text,$metadata);
    121 
    122     my $url_array = $metadata->{'URL'};
    123 
    124     if (defined $url_array && ($url_array->[0] !~ m/^http:/))
    125     {
    126     ## my $source_file =  &util::filename_cat($base_dir, $file);
    127 
    128     my $url_base_dir = &util::filename_head($filename);
    129 
    130 ##  print STDERR "*** url base dir = $url_base_dir/$url_array->[0]\n";
    131     print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
    132         if ($self->{'verbosity'}>1);
    133 
    134     return &plugin::read ($pluginfo, $url_base_dir, $url_array->[0],
    135                   $metadata, $processor, $maxdocs, $total_count, $gli);
    136     }
    137     else
    138     {
    139     # create a new document
    140     my $doc_obj = new doc ($filename, "indexed_doc");
    141     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    142     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    143     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    144     $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "OAI");
    145     $doc_obj->add_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
    146 
    147 
    148     # include any metadata passed in from previous plugins
    149     # note that this metadata is associated with the top level section
    150     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    151 
    152 
    153     # do plugin specific processing of doc_obj
    154     unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
    155         print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    156         return -1;
    157     }
    158 
    159     # do any automatic metadata extraction
    160     $self->auto_extract_metadata ($doc_obj);
    161 
    162     # add an OID
    163     $doc_obj->set_OID();
    164 
    165     # process the document
    166     $processor->process($doc_obj);
    167 
    168     return 1; # processed the file
    169255    }
    170256}
     
    199285
    200286
     287# Improvement is to merge this with newer version in MetadataPass
     288
     289sub open_prettyprint_metadata_table
     290{
     291    my $self = shift(@_);
     292
     293    my $att   = "width=100% cellspacing=2";
     294    my $style = "style=\'border-bottom: 4px solid #000080\'";
     295
     296    $self->{'ppmd_table'} = "\n<table $att $style>";
     297}
     298
     299sub add_prettyprint_metadata_line
     300{
     301    my $self = shift(@_);
     302    my ($metaname, $metavalue_utf8) = @_;
     303
     304    $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
     305    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
     306
     307    $self->{'ppmd_table'} .= "  <tr bgcolor=#b5d3cd>\n";
     308    $self->{'ppmd_table'} .= "    <td width=30%>\n";
     309    $self->{'ppmd_table'} .= "      $metaname\n";
     310    $self->{'ppmd_table'} .= "    </td>\n";
     311    $self->{'ppmd_table'} .= "    <td>\n";
     312    $self->{'ppmd_table'} .= "      $metavalue_utf8\n";
     313    $self->{'ppmd_table'} .= "    </td>\n";
     314    $self->{'ppmd_table'} .= "  </tr>\n";
     315
     316}
     317
     318sub close_prettyprint_metadata_table
     319{
     320    my $self = shift(@_);
     321
     322    $self->{'ppmd_table'} .= "</table>\n";
     323}
     324
     325
     326
    201327
    202328sub extract_oai_metadata {
     
    205331    my $outhandle = $self->{'outhandle'};
    206332
    207    
    208     if ($$textref =~ m/<metadata>(.*?)<\/metadata>/s)
     333    # Only handles DC metadata
     334
     335    $self->open_prettyprint_metadata_table();
     336
     337    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    209338    {
    210339    $metadata_text = $1;
     
    215344        # if URL given for document as identifier metadata, store it ...
    216345        # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
     346
    217347        my $metaname = $1;
    218348        my $metavalue = $2;
     
    237367        }
    238368
    239 
     369        $self->add_prettyprint_metadata_line($metaname, $metavalue);
     370       
    240371    }
    241372    }
     373
     374    $self->close_prettyprint_metadata_table();
    242375}
    243376
Note: See TracChangeset for help on using the changeset viewer.