Changeset 7204


Ignore:
Timestamp:
2004-04-19T17:14:16+12:00 (20 years ago)
Author:
mdewsnip
Message:

Changed the processing of the doc.xml files so that metadata from bibliographic files isn't completely ignored -- the values are still ignored, but the metadata elements are now extracted and added to the extracted metadata set.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java

    r6859 r7204  
    6161
    6262    private GShell shell;
    63 
    64     static final String ignore_list[] = {"assocfilepath", "gsdl",  "Identifier", "URL"}; //"Source",
     63    private String file_path;
     64
     65    static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
    6566
    6667    public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
     
    103104    Document document = Utility.parse(file, false);
    104105
    105     Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());
    106106    // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
    107     if(document != null) {
    108         String file_path = null;
     107    if (document != null) {
     108        file_path = null;
    109109        Element archive_element = document.getDocumentElement();
    110110        // Retrieve the initial Section element
    111111        NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
    112         // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements.
    113         if(section_elements.getLength() < 1) {
     112        // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
     113        if (section_elements.getLength() < 1) {
    114114        return count;
    115115        }
    116116        Element section_element = (Element) section_elements.item(0);       
    117117        section_elements = null;
    118         // Retrieve all of the Metadata sections.
     118
     119        // Retrieve all of the Metadata sections
    119120        NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
    120121        section_element = null;
    121         // We first zip through the retrieved metadata, and if we encounter the element 'SourceSegment' - a sure sign this collection came from a bibliographic type file - we break out of extracted metadata parsing as no sense could be made of the data extracted anyway (plus we suffer a death of thirty-thousand pointy bits of metadata!)
    122         for(int i = 0; i < metadata_elements.getLength(); i++) {
     122
     123        // Zip through the retrieved metadata checking for SourceSegment elements
     124        // These are a good sign of bibliographic files, which we must handle specially
     125        boolean ignore_values = false;
     126        for (int i = 0; i < metadata_elements.getLength(); i++) {
    123127        Element metadata_element = (Element) metadata_elements.item(i);
    124128        String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
    125         if(name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
    126             return 0;
    127         }
    128         }
     129        if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
     130            ignore_values = true;
     131            break;
     132        }
     133        }
     134
    129135        // Now for each Metadata entry retrieved...
    130         for(int i = 0; i < metadata_elements.getLength(); i++) {
     136        for (int i = 0; i < metadata_elements.getLength(); i++) {
    131137        Element metadata_element = (Element) metadata_elements.item(i);
    132         String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
    133         // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
    134         if(name.equals("gsdlsourcefilename")) {
    135             file_path = MSMUtils.getValue(metadata_element);
    136         }
    137         else {
    138             // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
    139             boolean ignore = false;
    140             for(int j = 0; !ignore && j < ignore_list.length; j++) {
    141             ignore = name.startsWith(ignore_list[j]);
    142             }
    143             // Otherwise ensure the metadata is present in our collection.
    144             if(!ignore && file_path != null) {
    145             // If we successfully retrieved a record we can continue.
    146             if(file_path != null) {
    147                 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
    148                 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
    149                 if(element == null) {
    150                 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
    151                 if(extracted_mds != null) {
    152                     element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
    153                 }
    154                 }
    155                 // If we successfully retrieved an element (and we should have) we can continue.
    156                 // WARNING!! There is one known exception - MARC records. Adding the extracted elements is all good, but adding the extracted metadata causes the whole thing to collapse in a pile of unhappy.
    157                 if(element != null && !file_path.endsWith(StaticStrings.MARC_EXTENSION) && (element.getNamespace().equals("") || element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) {
    158                 // Retrieve the metadata for the current file
    159                 File target_file = new File(file_path);
    160                 String value = "";
    161                 try {
    162                     value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
    163                 }
    164                 catch(IllegalArgumentException error) { // ****
    165                     value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
    166                 }
    167                 catch(UnsupportedEncodingException error) {
    168                     Gatherer.printStackTrace(error);
    169                 }
    170                 // If we successfully retrieved a value we can continue.
    171                 if(value != null) {
    172                     // Create a new metadata object.
    173                     GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
    174                     GValueNode value_node = null;
    175                     if(value_tree != null) {
    176                     value_node = value_tree.getValue(value);
    177                     }
    178                     else {
    179                     value_node = new GValueNode(element.toString(), value);
    180                     }
    181                     Metadata metadata = new Metadata(element, value_node);
    182                     element.inc();
    183                     ///ystem.err.println("Adding extracted metadata: " + metadata);
    184                     Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
    185                     count++;
    186                     // All done. On to next metadata.
    187                 }
    188                 value = null;
    189                 target_file = null;
    190                 }
    191                 else {
    192                 Gatherer.println("Cannot retrieve metadata element " + name);
    193                 }
    194             }
    195             }
    196         }
    197         }
    198     }
     138        if (processMetadataElement(metadata_element, ignore_values) == true) {
     139            count++;
     140        }
     141        }
     142    }
     143
    199144    return count;
    200145    }
     146
     147
     148    private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
     149    {
     150    String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
     151
     152    // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
     153    if (name.equals("gsdlsourcefilename")) {
     154        file_path = MSMUtils.getValue(metadata_element);
     155        return false;
     156    }
     157
     158    // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
     159    for (int j = 0; j < ignore_list.length; j++) {
     160        if (name.startsWith(ignore_list[j])) {
     161        return false;
     162        }
     163    }
     164
     165    if (file_path == null) {
     166        return false;
     167    }
     168
     169    // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
     170    ElementWrapper element = Gatherer.c_man.msm.getElement(name);
     171    if (element == null) {
     172        MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
     173        if (extracted_mds != null) {
     174        element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
     175        }
     176    }
     177
     178    // If ignore_values is set (bibliographic records) we don't care about the values
     179    if (ignore_values == true) {
     180        return false;
     181    }
     182
     183    // Retrieve the metadata for the current file
     184    File target_file = new File(file_path);
     185    String value = "";
     186    try {
     187        value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
     188    }
     189    catch (IllegalArgumentException error) { // ****
     190        value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
     191    }
     192    catch (UnsupportedEncodingException error) {
     193        Gatherer.printStackTrace(error);
     194    }
     195
     196    if (value == null) {
     197        return false;
     198    }
     199
     200    // Create a new metadata object.
     201    GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
     202    GValueNode value_node = null;
     203    if (value_tree != null) {
     204        value_node = value_tree.getValue(value);
     205    }
     206    else {
     207        value_node = new GValueNode(element.toString(), value);
     208    }
     209
     210    Metadata metadata = new Metadata(element, value_node);
     211    element.inc();
     212    ///ystem.err.println("Adding extracted metadata: " + metadata);
     213    Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
     214
     215    return true;
     216    }
     217
    201218
    202219    static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
Note: See TracChangeset for help on using the changeset viewer.