Changeset 8204


Ignore:
Timestamp:
2004-10-01T15:23:57+12:00 (20 years ago)
Author:
mdewsnip
Message:

Rewrote getMetadataExtractedFromFile() to read doc.xml files line by line (rather than parsing them as XML) to improve speed, and to read only the lines relevant to the file whose metadata is being requested. This also required changing skimFile() to build a mapping from each source filename to the line numbers at which its Description elements start.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r8165 r8204  
    3232import org.greenstone.gatherer.Gatherer;
    3333import org.greenstone.gatherer.util.Utility;
    34 import org.greenstone.gatherer.util.XMLTools;
    35 import org.w3c.dom.*;
    3634
    3735
     
    4038    extends File
    4139{
    42     static final private String ARCHIVE_ELEMENT = "Archive";
    43     static final private String DESCRIPTION_ELEMENT = "Description";
    44     static final private String METADATA_ELEMENT = "Metadata";
    45     static final private String SECTION_ELEMENT = "Section";
    46 
    47     private ArrayList files_in_doc_xml_file = new ArrayList();
     40    private HashMap source_file_name_to_description_elements_mapping = new HashMap();
    4841
    4942
     
    6659
    6760    // Check whether this doc.xml file contains extracted metadata for the specified file
    68     boolean contains_extracted_metadata_for_file = false;
    69     for (int i = 0; i < files_in_doc_xml_file.size(); i++) {
    70         if (file_relative_path.equals(files_in_doc_xml_file.get(i))) {
    71         contains_extracted_metadata_for_file = true;
    72         break;
     61    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
     62    if (description_elements_list == null) {
     63        // ...it doesn't
     64        return metadata_values;
     65    }
     66
     67    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
     68
     69    // Parse the doc.xml file
     70    System.err.println("Applicable doc.xml file: " + this);
     71    try {
     72        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     73
     74        int description_element_num = 0;
     75        int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
     76        boolean in_relevant_description_element = false;
     77
     78        String line = null;
     79        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
     80        // Check if this line contains the start of a relevant Description element
     81        if (line_num == next_description_element_start) {
     82            in_relevant_description_element = true;
     83            continue;
     84        }
     85
     86        // If we're not in a relevant Description element we don't care about anything
     87        if (in_relevant_description_element == false) {
     88            continue;
     89        }
     90
     91        // Check if this line contains the end of the relevant Description element
     92        if (line.indexOf("</Description>") != -1) {
     93            description_element_num++;
     94            if (description_element_num == description_elements_list.size()) {
     95            break;
     96            }
     97
     98            next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
     99            in_relevant_description_element = false;
     100            continue;
     101        }
     102
     103        // If this line doesn't contain a Metadata element, we're not interested
     104        if (line.indexOf("<Metadata ") == -1) {
     105            continue;
     106        }
     107
     108        // Extract the metadata element name
     109        int name_index = line.indexOf(" name=\"") + " name=\"".length();
     110        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
     111
     112        // If the metadata has a namespace it isn't extracted metadata, so we're not interested
     113        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
     114        if (!metadata_set_namespace.equals("")) {
     115            continue;
     116        }
     117
     118        // Extracted metadata!
     119        String metadata_element_name = metadata_element_name_full;
     120
     121        // We completely ignore bibliographic data
     122        if (metadata_element_name.equals("SourceSegment")) {
     123            return new ArrayList();
     124        }
     125
     126        // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
     127        //   and those starting with '/' (/srclink)
     128        char first_character = metadata_element_name.charAt(0);
     129        if (Character.isLowerCase(first_character) || first_character == '/') {
     130            continue;
     131        }
     132
     133        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
     134
     135        // Value trees are not stored for extracted metadata, so create a new value tree node now
     136        int value_index = line.indexOf(">", name_index) + ">".length();
     137        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
     138
     139        metadata_element.addMetadataValue(metadata_element_value);
     140        MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
     141
     142        // Add the new metadata value to the list
     143        MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
     144        metadata_values.add(metadata_value);
    73145        }
    74146    }
    75 
    76     // ...it doesn't
    77     if (!contains_extracted_metadata_for_file) {
    78         return metadata_values;
    79     }
    80 
    81     // Parse the doc.xml file
    82     Gatherer.println("Applicable doc.xml file: " + this);
    83     Document document = XMLTools.parseXMLFile(this);
    84     if (document == null) {
    85         System.err.println("Error: Could not parse doc.xml file " + getAbsolutePath());
    86         return metadata_values;
    87     }
    88 
    89     MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
    90 
    91     // Read all the Archive elements in the file
    92     NodeList archive_elements_nodelist = document.getElementsByTagName(ARCHIVE_ELEMENT);
    93     for (int i = 0; i < archive_elements_nodelist.getLength(); i++) {
    94         Element current_archive_element = (Element) archive_elements_nodelist.item(i);
    95 
    96         // Read the child Section elements of the archive (but not all descendants)
    97         ArrayList child_section_elements = XMLTools.getChildElementsByTagName(current_archive_element, SECTION_ELEMENT);
    98         for (int j = 0; j < child_section_elements.size(); j++) {
    99         Element current_section_element = (Element) child_section_elements.get(j);
    100 
    101         // Read the Description elements of this section only (not child sections as well)
    102         ArrayList child_description_elements = XMLTools.getChildElementsByTagName(current_section_element, DESCRIPTION_ELEMENT);
    103         for (int k = 0; k < child_description_elements.size(); k++) {
    104             Element current_description_element = (Element) child_description_elements.get(k);
    105 
    106             // Read all the Metadata elements in this description element
    107             NodeList metadata_elements_nodelist = current_description_element.getElementsByTagName(METADATA_ELEMENT);
    108             for (int l = 0; l < metadata_elements_nodelist.getLength(); l++) {
    109             Element current_metadata_element = (Element) metadata_elements_nodelist.item(l);
    110             String metadata_element_name_full = current_metadata_element.getAttribute("name");
    111 
    112             // If the metadata has a namespace it isn't extracted metadata, so we're not interested
    113             String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
    114             if (!metadata_set_namespace.equals("")) {
    115                 continue;
    116             }
    117 
    118             // Extracted metadata!
    119             String metadata_element_name = metadata_element_name_full;
    120 
    121             // We completely ignore bibliographic data
    122             if (metadata_element_name.equals("SourceSegment")) {
    123                 return new ArrayList();
    124             }
    125 
    126             // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
    127             //   and those starting with '/' (/srclink)
    128             char first_character = metadata_element_name.charAt(0);
    129             if (Character.isLowerCase(first_character) || first_character == '/') {
    130                 continue;
    131             }
    132 
    133             MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
    134 
    135             // Value trees are not stored for extracted metadata, so create a new value tree node now
    136             String current_metadata_element_value = XMLTools.getElementTextValue(current_metadata_element);
    137             metadata_element.addMetadataValue(current_metadata_element_value);
    138             MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(current_metadata_element_value);
    139 
    140             // Add the new metadata value to the list
    141             MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
    142             metadata_values.add(metadata_value);
    143             }
    144         }
    145         }
     147    catch (Exception ex) {
     148        System.err.println("Exception: " + ex);
     149        ex.printStackTrace();
    146150    }
    147151
     
    160164
    161165    // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
    162     Gatherer.println("Skimming doc.xml file " + this + "...");
     166    System.err.println("Skimming doc.xml file " + this + "...");
    163167    try {
    164168        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     169        int description_element_start = -1;
     170
    165171        String line = null;
    166         while ((line = buffered_reader.readLine()) != null) {
    167         // This line doesn't contain a metadata element
     172        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
     173        // This line contains the start of a Description element
     174        if (line.indexOf("<Description>") != -1) {
     175            if (description_element_start != -1) {
     176            System.err.println("Parse error: previous Description element unfinished!");
     177            }
     178            description_element_start = line_num;
     179            continue;
     180        }
     181
     182        // This line contains the end of a Description element
     183        if (line.indexOf("</Description>") != -1) {
     184            if (description_element_start == -1) {
     185            System.err.println("Parse error: Description element unstarted!");
     186            }
     187            description_element_start = -1;
     188            continue;
     189        }
     190
     191        // If we're not in a Description element there shouldn't be any Metadata elements
     192        if (description_element_start == -1) {
     193            continue;
     194        }
     195
     196        // This line doesn't contain a Metadata element, so we're not interested
    168197        if (line.indexOf("<Metadata ") == -1) {
     198            System.err.println("Parse error: Description element line doesn't contain Metadata element.");
    169199            continue;
    170200        }
     
    208238
    209239            // Remember this for quick access later
    210             files_in_doc_xml_file.add(gsdlsourcefilename_value);
     240            if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
     241                source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
     242            }
     243
     244            ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
    211245            }
    212246            else {
     
    218252        // We completely ignore bibliographic data
    219253        if (metadata_element_name.equals("SourceSegment")) {
    220             files_in_doc_xml_file.clear();
     254            // !!! source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
    221255        }
    222256
Note: See TracChangeset for help on using the changeset viewer.