Ignore:
Timestamp:
2004-09-22T11:53:21+12:00 (20 years ago)
Author:
mdewsnip
Message:

More improvements to the new metadata code, including language-specific metadata element display and a 5x speed-up in skimming the doc.xml files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r8123 r8131  
    2525    super(doc_xml_file_path);
    2626
    27     // Parse the doc.xml file
    28     System.err.println("Loading doc.xml file " + doc_xml_file_path + "...");
    29     Document document = XMLTools.parseXMLFile(this);
    30     if (document == null) {
    31         System.err.println("Error: Could not parse doc.xml file " + getAbsolutePath());
    32         return;
    33     }
    34 
    3527    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
    3628
    37     // Read all the Archive elements in the file
    38     NodeList archive_elements_nodelist = document.getElementsByTagName(ARCHIVE_ELEMENT);
    39     for (int i = 0; i < archive_elements_nodelist.getLength(); i++) {
    40         Element current_archive_element = (Element) archive_elements_nodelist.item(i);
     29    // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
     30    System.err.println("Skimming doc.xml file " + this + "...");
     31    try {
     32        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     33        String line = null;
     34        while ((line = buffered_reader.readLine()) != null) {
     35        // This line doesn't contain a metadata element
     36        if (line.indexOf("<Metadata ") == -1) {
     37            continue;
     38        }
    4139
    42         // Read the child Section elements of the archive (but not all descendants)
    43         ArrayList child_section_elements = XMLTools.getChildElementsByTagName(current_archive_element, SECTION_ELEMENT);
    44         for (int j = 0; j < child_section_elements.size(); j++) {
    45         Element current_section_element = (Element) child_section_elements.get(j);
     40        // Extract the metadata element name
     41        int name_index = line.indexOf(" name=\"") + " name=\"".length();
     42        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
    4643
    47         // Read the Description elements of this section only (not child sections as well)
    48         ArrayList child_description_elements = XMLTools.getChildElementsByTagName(current_section_element, DESCRIPTION_ELEMENT);
    49         for (int k = 0; k < child_description_elements.size(); k++) {
    50             Element current_description_element = (Element) child_description_elements.get(k);
     44        // If the metadata has a namespace it isn't extracted metadata, so we're not interested
     45        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
     46        if (!metadata_set_namespace.equals("")) {
     47            continue;
     48        }
    5149
    52             String gsdlsourcefilename_value = null;
    53             boolean bibliographic_data = false;
     50        // Extracted metadata!
     51        String metadata_element_name = metadata_element_name_full;
    5452
    55             // Read all the Metadata elements in this description element
    56             NodeList metadata_elements_nodelist = current_description_element.getElementsByTagName(METADATA_ELEMENT);
    57             for (int l = 0; l < metadata_elements_nodelist.getLength(); l++) {
    58             Element current_metadata_element = (Element) metadata_elements_nodelist.item(l);
    59             String metadata_element_name_full = current_metadata_element.getAttribute("name");
     53        // Note which file this doc.xml is for
     54        if (metadata_element_name.equals("gsdlsourcefilename")) {
     55            // Extract the gsdlsourcefilename element value
     56            int value_index = line.indexOf(">", name_index) + ">".length();
     57            String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
    6058
    61             // If the metadata has a namespace it isn't extracted metadata, so we're not interested
    62             String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
    63             if (!metadata_set_namespace.equals("")) {
    64                 continue;
     59            // We're only interested in the path relative to the import folder
     60            int import_index = gsdlsourcefilename_value.indexOf("import");
     61            if (import_index != -1) {
     62            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
     63
     64            boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
     65            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
     66
     67            // Make sure the path matches the OS that is running
     68            if (is_unix_path && Utility.isWindows()) {
     69                // Convert path from Unix to Windows
     70                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("/", File.separator);
     71            }
     72            if (!is_unix_path && !Utility.isWindows()) {
     73                // Convert path from Windows to Unix
     74                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", File.separator);
    6575            }
    6676
    67             // Extracted metadata!
    68             String metadata_element_name = metadata_element_name_full;
    69 
    70             // Note which file this Section is for
    71             if (metadata_element_name.equals("gsdlsourcefilename")) {
    72                 gsdlsourcefilename_value = XMLTools.getElementTextValue(current_metadata_element);
    73 
    74                 // We're only interested in the path relative to the import folder
    75                 int import_index = gsdlsourcefilename_value.indexOf("import");
    76                 if (import_index != -1) {
    77                 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
    78 
    79                 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
    80                 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
    81 
    82                 // Make sure the path matches the OS that is running
    83                 if (is_unix_path && Utility.isWindows()) {
    84                     // Convert path from Unix to Windows
    85                     gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("/", File.separator);
    86                 }
    87                 if (!is_unix_path && !Utility.isWindows()) {
    88                     // Convert path from Windows to Unix
    89                     gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", File.separator);
    90                 }
    91                 }
    92                 else {
    93                 // We don't really know what is going on...
    94                 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
    95                 }
    96             }
    97 
    98             // We don't do much with bibliographic data
    99             if (metadata_element_name.equals("SourceSegment")) {
    100                 bibliographic_data = true;
    101             }
    102 
    103             // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
    104             //   and those starting with '/' (/srclink)
    105             char first_character = metadata_element_name.charAt(0);
    106             if (Character.isLowerCase(first_character) || first_character == '/') {
    107                 continue;
    108             }
    109 
    110             MetadataElement metadata_element = extracted_metadata_set.getMetadataElement(metadata_element_name);
    111             if (metadata_element == null) {
    112                 // This element isn't defined in ex.mds, so create it for this session
    113                 System.err.println("Extracted metadata element not defined: " + metadata_element_name);
    114                 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
     77            // Remember this for quick access later
     78            if (gsdlsourcefilename_value != null) {
     79                files_in_doc_xml_file.add(gsdlsourcefilename_value);
    11580            }
    11681            }
    117 
    118             // Remember this for quick access later
    119             if (gsdlsourcefilename_value != null && !bibliographic_data) {
    120             files_in_doc_xml_file.add(gsdlsourcefilename_value);
     82            else {
     83            // We don't really know what is going on...
     84            System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
    12185            }
    12286        }
     87
     88        // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
     89        //   and those starting with '/' (/srclink)
     90        char first_character = metadata_element_name.charAt(0);
     91        if (Character.isLowerCase(first_character) || first_character == '/') {
     92            continue;
     93        }
     94
     95        MetadataElement metadata_element = extracted_metadata_set.getMetadataElement(metadata_element_name);
     96        if (metadata_element == null) {
     97            // This element isn't defined in ex.mds, so create it for this session
     98            System.err.println("Extracted metadata element not defined: " + metadata_element_name);
     99            extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
     100        }
    123101        }
     102    }
     103    catch (Exception ex) {
     104        System.err.println("Exception: " + ex);
     105        ex.printStackTrace();
    124106    }
    125107    }
Note: See TracChangeset for help on using the changeset viewer.