Changeset 11298


Ignore:
Timestamp:
2006-03-02T14:53:11+13:00 (18 years ago)
Author:
mdewsnip
Message:

Fairly substantial changes to handle repeatable and non-repeatable fields correctly. Whether a field is repeatable is now read from the .fdt file, instead of special-casing Keywords metadata.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ISISPlug.pm

    r11295 r11298  
    143143    # Parse the associated ISIS database Field Definition Table file (.fdt)
    144144    my %fdtmapping = &parse_field_definition_table($fdtfilename, $encoding);
    145 
    146     # Map the tag numbers to tag names, using the FDT mapping
    147     $$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
     145    $self->{'fdt_mapping'} = \%fdtmapping;
    148146
    149147    # Remove the line at the start so it is split and processed properly
     
    158156    my $outhandle = $self->{'outhandle'};
    159157
    160     my $cursection = $doc_obj->get_top_section();
     158    my $section = $doc_obj->get_top_section();
     159    my $fdt_mapping = $self->{'fdt_mapping'};
    161160    my $subfield_separator = $self->{'subfield_separator'};
    162161    my $entry_separator = $self->{'entry_separator'};
     
    164163    # Report that we're processing the file
    165164    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    166     print $outhandle "IsisPlug: processing $file\n"
    167     if ($self->{'verbosity'}) > 1;
     165    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
    168166
    169167    # Process each line of the ISIS record, one at a time
    170168    foreach my $line (split(/\n/, $$textref)) {
    171169    $line =~ /^tag=(.*) data=(.+)$/;
    172     my $rawtagname = $1;
    173     my $rawtagdata = $2;
    174     # print STDERR "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
    175     next if ($rawtagname eq "");
     170    my $tag = $1;
     171    my $tag_data = $2;
     172        # print STDERR "\nTag: $tag, Data: $tag_data\n";
     173
     174    # Convert the tag number into a name, and remove any invalid characters
     175    my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
     176    $raw_metadata_name =~ s/&//g;
     177    next if ($raw_metadata_name eq "");
    176178
    177179    # Metadata field names: title case, then remove spaces
    178     my $tagname = "";
    179     foreach my $word (split(/\s+/, $rawtagname)) {
     180    my $metadata_name = "";
     181    foreach my $word (split(/\s+/, $raw_metadata_name)) {
    180182        substr($word, 0, 1) =~ tr/a-z/A-Z/;
    181         $tagname .= $word;
    182     }
    183 
    184     # Make sure there is nothing bad in the tag names
    185     $tagname =~ s/&//g;
    186 
    187     # Handle each piece of metadata ('%' separated)
    188     my $completetagvalue = "";
    189     foreach my $rawtagvalue (split(/%/, $rawtagdata)) {
    190         $completetagvalue .= $entry_separator unless ($completetagvalue eq "");
    191 
    192         # Metadata field values: take care with subfields
    193         my $completeentryvalue = "";
    194         while ($rawtagvalue ne "") {
     183        $metadata_name .= $word;
     184    }
     185
     186    my $all_metadata_name = $metadata_name . "^all";
     187    my $all_metadata_value = "";
     188
     189    # Handle repeatable fields
     190    if ($fdt_mapping->{$tag}{'repeatable'}) {
     191        # Multiple values are separated using the '%' character
     192        foreach my $raw_metadata_value (split(/%/, $tag_data)) {
     193        my $metadata_value = "";
     194
     195        # Handle subfields
     196        while ($raw_metadata_value ne "") {
     197            # If there is a subfield specifier, parse it off
     198            my $sub_metadata_name = $metadata_name;
     199            if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ /^([a-z])/) {
     200            $sub_metadata_name .= "^$1";
     201            }
     202
     203            # Parse the value off and add it as metadata
     204            $raw_metadata_value =~ s/^([^\^]*)//;
     205            my $sub_metadata_value = $1;
     206
     207            # Escape any '<' and '>' characters so they appear correctly in the final collection
     208            $sub_metadata_value =~ s/\</&lt;/g;
     209            $sub_metadata_value =~ s/\>/&gt;/g;
     210
     211            # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
     212            if ($sub_metadata_name ne $metadata_name) {
     213            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
     214            }
     215
     216            $metadata_value .= $subfield_separator unless ($metadata_value eq "");
     217            $metadata_value .= $sub_metadata_value;
     218        }
     219
     220        # Add the metadata value
     221        # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
     222        $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
     223
     224        $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
     225        $all_metadata_value .= $metadata_value;
     226        }
     227    }
     228
     229    # Handle non-repeatable fields
     230    else {
     231        my $raw_metadata_value = $tag_data;
     232        my $metadata_value = "";
     233
     234        # Handle subfields
     235        while ($raw_metadata_value ne "") {
    195236        # If there is a subfield specifier, parse it off
    196         my $subfieldname = "";
    197         if ($rawtagvalue =~ s/^\^// && $rawtagvalue =~ s/([a-z])//) {
    198             $subfieldname = "^$1";
     237        my $sub_metadata_name = $metadata_name;
     238        if ($raw_metadata_value =~ s/^(\^[a-z])//) {
     239            $sub_metadata_name .= $1;
    199240        }
    200241
    201         # Parse the metadata value off
    202         $rawtagvalue =~ s/^([^\^]*)//;
    203         my $metadatafieldname = $tagname . $subfieldname;
    204         my $metadatafieldvalue = $1;
    205        
    206         # Handle Keywords specially
    207         if ($metadatafieldname eq "Keywords") {
    208             my $keywordmetadatavalue = $metadatafieldvalue;
    209             my $keywordlist = "";
    210             while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
    211             my $keyword = $1;
    212             $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
    213             $keywordlist .= ", " unless ($keywordlist eq "");
    214             $keywordlist .= $keyword;
     242        # Parse the value off and add it as metadata
     243        $raw_metadata_value =~ s/^([^\^]*)//;
     244        my $sub_metadata_value = $1;
     245
     246        # Deal with the case when multiple values are specified using <...>
     247        if ($sub_metadata_value =~ /\<(.*)\>$/) {
     248            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
     249            my $tmp_sub_metadata_value = $sub_metadata_value;
     250            while ($tmp_sub_metadata_value =~ s/\<(.*?)\>//) {
     251            my $sub_sub_metadata_value = $1;
     252            $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
    215253            }
    216 
    217             $metadatafieldvalue = $keywordlist;
    218254        }
    219255
    220256        # Escape any '<' and '>' characters so they appear correctly in the final collection
    221         $metadatafieldvalue =~ s/\</&lt;/g;
    222         $metadatafieldvalue =~ s/\>/&gt;/g;
    223 
    224         # We have already added Keywords metadata above
    225         unless ($metadatafieldname eq "Keywords") {
    226             $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
     257        $sub_metadata_value =~ s/\</&lt;/g;
     258        $sub_metadata_value =~ s/\>/&gt;/g;
     259
     260        # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
     261        if ($sub_metadata_name ne $metadata_name) {
     262            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
    227263        }
    228264
    229         $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
    230         $completeentryvalue .= $metadatafieldvalue;
     265        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
     266        $metadata_value .= $sub_metadata_value;
    231267        }
    232268
    233         $completetagvalue .= $completeentryvalue;
    234     }
    235 
    236     $doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
     269        # Add the metadata value
     270        # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
     271        $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
     272
     273        $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
     274        $all_metadata_value .= $metadata_value;
     275    }
     276
     277    # Add the "^all" metadata value
     278    # print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
     279    $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
    237280    }
    238281
     
    240283    $$textref =~ s/\</&lt;/g;
    241284    $$textref =~ s/\>/&gt;/g;
    242     $doc_obj->add_utf8_text($cursection, $$textref);
     285    $doc_obj->add_utf8_text($section, $$textref);
    243286
    244287    # Add FileFormat metadata
    245     $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");
    246 
    247     # Record was processed successfully (and there was no document obtained)
     288    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
     289
     290    # Record was processed successfully
    248291    return 1;
    249292}
     
    270313
    271314    if ($amongstdefinitions) {
    272         my $fieldtitle     = substr($fdtfileline,  0, 30);
     315        my $fieldname      = substr($fdtfileline,  0, 30);
    273316        my $fieldsubfields = substr($fdtfileline, 30, 20);
    274317        my $fieldspecs     = substr($fdtfileline, 50);
    275318
    276319        # Remove extra spaces
    277         $fieldtitle =~ s/(\s*)$//;
     320        $fieldname =~ s/(\s*)$//;
    278321        $fieldsubfields =~ s/(\s*)$//;
    279 
    280         # Map from tag number to metadata field title and subfields
    281         my ($fieldtag) = ($fieldspecs =~ /^\s*(\d+)\s+/);
    282         $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
    283                        'subfields' => $fieldsubfields };
     322        $fieldspecs =~ s/(\s*)$//;
     323
     324        # Map from tag number to metadata field title, subfields, and repeatability
     325        my $fieldtag = (split(/ /, $fieldspecs))[0];
     326        my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
     327        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
     328                       'subfields' => $fieldsubfields,
     329                       'repeatable' => $fieldrepeatable };
    284330    }
    285331    elsif ($fdtfileline eq "***") {
Note: See TracChangeset for help on using the changeset viewer.