Changeset 18362

Show
Ignore:
Timestamp:
12.01.2009 11:19:23 (10 years ago)
Author:
kjdon
Message:

updated the rtl-gli branch with files from trunk. Result of a merge 14807:18318

Location:
gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata
Files:
4 modified
2 copied

Legend:

Unmodified
Added
Removed
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r12041 r18362  
    3030import java.io.*; 
    3131import java.util.*; 
     32import java.net.URLDecoder; 
    3233import org.greenstone.gatherer.DebugStream; 
    3334import org.greenstone.gatherer.util.Utility; 
     
    3536 
    3637/** This class represents one doc.xml file */ 
    37 public class DocXMLFile 
    38     extends File 
     38 
     39public abstract class DocXMLFile extends File 
    3940{ 
    40     private HashMap source_file_name_to_description_elements_mapping = new HashMap(); 
    41  
    42  
    43     public DocXMLFile(String doc_xml_file_path) 
     41    protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); 
     42 
     43    protected final String MetadataWrap; 
     44    protected final String MetadataItem; 
     45 
     46    public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem) 
    4447    { 
    4548    super(doc_xml_file_path); 
     49    this.MetadataWrap = metaWrap; 
     50    this.MetadataItem = metaItem; 
    4651    } 
    4752 
     
    5863    } 
    5964 
    60     // Check whether this doc.xml file contains extracted metadata for the specified file 
     65    // Check whether this file (i.e. doc.xml, or docmets.xml in the inheriting subclass) contains extracted metadata for the specified file 
    6166    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
    6267    if (description_elements_list == null) { 
     
    6772    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
    6873 
    69     // Parse the doc.xml file 
    70     DebugStream.println("Applicable doc.xml file: " + this); 
     74    // Parse the file 
     75    DebugStream.println("Applicable file: " + this); 
    7176    try { 
    7277        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     
    7883        String line = null; 
    7984        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
    80         // Check if this line contains the start of a relevant Description element 
     85        // Check if this line contains the start of a relevant "Description" element 
     86        // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)  
    8187        if (line_num == next_description_element_start) { 
    8288            in_relevant_description_element = true; 
     
    9096 
    9197        // Check if this line contains the end of the relevant Description element 
    92         if (line.indexOf("</Description>") != -1) { 
     98        if (line.indexOf("</"+MetadataWrap+">") != -1) { 
    9399            description_element_num++; 
    94100            if (description_element_num == description_elements_list.size()) { 
     
    102108 
    103109        // If this line doesn't contain a complete Metadata element, we're not interested 
    104         if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) { 
     110        if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) { 
    105111            continue; 
    106112        } 
     
    134140        // Value trees are not stored for extracted metadata, so create a new value tree node now 
    135141        int value_index = line.indexOf(">", name_index) + ">".length(); 
    136         String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 
     142        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">")); 
    137143 
    138144        metadata_element.addMetadataValue(metadata_element_value); 
     
    157163 
    158164 
     165 
     166 
    159167    /** 
    160      * Every doc.xml file must be skimmed when a collection is opened, for two reasons: 
     168     * Every file must be skimmed when a collection is opened, for two reasons: 
    161169     *   - To build a mapping from source file to its corresponding doc.xml file 
    162170     *   - To get a complete list of all extracted metadata elements 
     
    166174    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
    167175 
    168     // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 
    169     DebugStream.println("Skimming doc.xml file " + this + "..."); 
     176    // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements 
     177    DebugStream.println("Skimming " + this + "..."); 
    170178    try { 
    171179        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     
    174182        String line = null; 
    175183        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
    176         // This line contains the start of a Description element 
    177         if (line.indexOf("<Description>") != -1) { 
     184        // This line contains the start of a "MetadataWrap" element  
     185        // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)  
     186        if (line.indexOf("<"+MetadataWrap+">") != -1) { 
    178187            if (description_element_start != -1) { 
    179             System.err.println("Parse error: previous Description element unfinished!"); 
     188            System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!"); 
    180189            } 
    181190            description_element_start = line_num; 
     
    183192        } 
    184193 
    185         // This line contains the end of a Description element 
    186         if (line.indexOf("</Description>") != -1) { 
     194        // This line contains the end of a "MetadataWrap" element 
     195        if (line.indexOf("</"+MetadataWrap+">") != -1) { 
    187196            if (description_element_start == -1) { 
    188             System.err.println("Parse error: Description element unstarted!"); 
     197            System.err.println("Parse error: "+MetadataWrap+" element unstarted!"); 
    189198            } 
    190199            description_element_start = -1; 
     
    192201        } 
    193202 
    194         // If we're not in a Description element there shouldn't be any Metadata elements 
     203        // If we're not in a "MetadataWrap" element there shouldn't be any Metadata elements 
    195204        if (description_element_start == -1) { 
    196205            continue; 
     
    198207 
    199208        // This line doesn't contain a Metadata element, so we're not interested 
    200         if (line.indexOf("<Metadata ") == -1) { 
    201             DebugStream.println("Warning: Description element line doesn't contain Metadata element."); 
     209        if (line.indexOf("<"+MetadataItem+" ") == -1) { 
     210            DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element."); 
    202211            continue; 
    203212        } 
     
    216225        String metadata_element_name = metadata_element_name_full; 
    217226 
    218         // Note which file this doc.xml is for 
     227        // Note which file this is for 
    219228        if (metadata_element_name.equals("gsdlsourcefilename")) { 
    220229            // Extract the gsdlsourcefilename element value 
     
    229238            boolean is_unix_path = gsdlsourcefilename_value.startsWith("/"); 
    230239            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 
     240 
     241            // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 
     242            // This is stored in the System's file.encoding property. 
     243            gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));  
    231244 
    232245            // Make sure the path matches the OS that is running 
     
    248261            } 
    249262 
     263            // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or 
     264            // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file 
     265            // which are the gsdlsourcefilenames for the fedora digital object representing a collection. 
     266            // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example 
     267            else if (gsdlsourcefilename_value.indexOf("tmp") == -1  
     268                 && !gsdlsourcefilename_value.endsWith("collect.cfg") 
     269                 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) { 
     270            // We don't really know what is going on... 
     271            System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value); 
     272            } 
     273        } 
     274 
     275        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 
     276        if (metadata_element_name.startsWith("gsdl")) { 
     277            continue; 
     278        } 
     279 
     280        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 
     281        if (metadata_element == null) { 
     282            // This element isn't defined in ex.mds, so create it for this session 
     283            DebugStream.println("Extracted metadata element not defined: " + metadata_element_name); 
     284            extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name); 
     285        } 
     286        } 
     287 
     288        buffered_reader.close(); 
     289    } 
     290    catch (FileNotFoundException exception) { 
     291        DebugStream.printStackTrace(exception); 
     292    } 
     293    catch (IOException exception) { 
     294        DebugStream.printStackTrace(exception); 
     295    } 
     296    } 
     297 
     298 
     299    /* 
     300    public ArrayList getMetadataExtractedFromFile(File file) 
     301    { 
     302    // Build up a list of metadata extracted from this file 
     303    ArrayList metadata_values = new ArrayList(); 
     304 
     305    String file_relative_path = file.getAbsolutePath(); 
     306    int import_index = file_relative_path.indexOf("import"); 
     307    if (import_index != -1) { 
     308        file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 
     309    } 
     310 
     311    // Check whether this doc.xml file contains extracted metadata for the specified file 
     312    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
     313    if (description_elements_list == null) { 
     314        // ...it doesn't 
     315        return metadata_values; 
     316    } 
     317 
     318    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
     319 
     320    // Parse the doc.xml file 
     321    DebugStream.println("Applicable doc.xml file: " + this); 
     322    try { 
     323        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     324 
     325        int description_element_num = 0; 
     326        int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 
     327        boolean in_relevant_description_element = false; 
     328 
     329        String line = null; 
     330        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
     331        // Check if this line contains the start of a relevant Description element 
     332        if (line_num == next_description_element_start) { 
     333            in_relevant_description_element = true; 
     334            continue; 
     335        } 
     336 
     337        // If we're not in a relevant Description element we don't care about anything 
     338        if (in_relevant_description_element == false) { 
     339            continue; 
     340        } 
     341 
     342        // Check if this line contains the end of the relevant Description element 
     343        if (line.indexOf("</Description>") != -1) { 
     344            description_element_num++; 
     345            if (description_element_num == description_elements_list.size()) { 
     346            break; 
     347            } 
     348 
     349            next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 
     350            in_relevant_description_element = false; 
     351            continue; 
     352        } 
     353 
     354        // If this line doesn't contain a complete Metadata element, we're not interested 
     355        if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) { 
     356            continue; 
     357        } 
     358 
     359        // Extract the metadata element name 
     360        int name_index = line.indexOf(" name=\"") + " name=\"".length(); 
     361        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 
     362 
     363        // If the metadata has a namespace it isn't extracted metadata, so we're not interested 
     364        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 
     365        if (!metadata_set_namespace.equals("")) { 
     366            continue; 
     367        } 
     368 
     369        // Extracted metadata! 
     370        String metadata_element_name = metadata_element_name_full; 
     371 
     372        // We completely ignore bibliographic data 
     373        if (metadata_element_name.equals("SourceSegment")) { 
     374            buffered_reader.close(); 
     375            return new ArrayList(); 
     376        } 
     377 
     378        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 
     379        if (metadata_element_name.startsWith("gsdl")) { 
     380            continue; 
     381        } 
     382 
     383        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 
     384 
     385        // Value trees are not stored for extracted metadata, so create a new value tree node now 
     386        int value_index = line.indexOf(">", name_index) + ">".length(); 
     387        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 
     388 
     389        metadata_element.addMetadataValue(metadata_element_value); 
     390        MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value); 
     391 
     392        // Add the new metadata value to the list 
     393        MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node); 
     394        metadata_values.add(metadata_value); 
     395        } 
     396 
     397        buffered_reader.close(); 
     398    } 
     399    catch (FileNotFoundException exception) { 
     400        DebugStream.printStackTrace(exception); 
     401    } 
     402    catch (IOException exception) { 
     403        DebugStream.printStackTrace(exception); 
     404    } 
     405 
     406    return metadata_values; 
     407    } 
     408 
     409    */ 
     410 
     411    /** 
     412     * Every doc.xml file must be skimmed when a collection is opened, for two reasons: 
     413     *   - To build a mapping from source file to its corresponding doc.xml file 
     414     *   - To get a complete list of all extracted metadata elements 
     415     */ 
     416    /* 
     417    public void skimFile() 
     418    { 
     419    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
     420 
     421    // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 
     422    DebugStream.println("Skimming " + this + "..."); 
     423    try { 
     424        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     425        int description_element_start = -1; 
     426 
     427        String line = null; 
     428        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
     429        // This line contains the start of a Description element 
     430        if (line.indexOf("<Description>") != -1) { 
     431            if (description_element_start != -1) { 
     432            System.err.println("Parse error: previous Description element unfinished!"); 
     433            } 
     434            description_element_start = line_num; 
     435            continue; 
     436        } 
     437 
     438        // This line contains the end of a Description element 
     439        if (line.indexOf("</Description>") != -1) { 
     440            if (description_element_start == -1) { 
     441            System.err.println("Parse error: Description element unstarted!"); 
     442            } 
     443            description_element_start = -1; 
     444            continue; 
     445        } 
     446 
     447        // If we're not in a Description element there shouldn't be any Metadata elements 
     448        if (description_element_start == -1) { 
     449            continue; 
     450        } 
     451 
     452        // This line doesn't contain a Metadata element, so we're not interested 
     453        if (line.indexOf("<Metadata ") == -1) { 
     454            DebugStream.println("Warning: Description element line doesn't contain Metadata element."); 
     455            continue; 
     456        } 
     457 
     458        // Extract the metadata element name 
     459        int name_index = line.indexOf(" name=\"") + " name=\"".length(); 
     460        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 
     461 
     462        // If the metadata has a namespace it isn't extracted metadata, so we're not interested 
     463        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 
     464        if (!metadata_set_namespace.equals("")) { 
     465            continue; 
     466        } 
     467 
     468        // Extracted metadata! 
     469        String metadata_element_name = metadata_element_name_full; 
     470 
     471        // Note which file this doc.xml is for 
     472        if (metadata_element_name.equals("gsdlsourcefilename")) { 
     473            // Extract the gsdlsourcefilename element value 
     474            int value_index = line.indexOf(">", name_index) + ">".length(); 
     475            String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index)); 
     476 
     477            // We're only interested in the path relative to the import folder 
     478            int import_index = gsdlsourcefilename_value.indexOf("import"); 
     479            if (import_index != -1) { 
     480            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length()); 
     481 
     482            boolean is_unix_path = gsdlsourcefilename_value.startsWith("/"); 
     483            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 
     484 
     485            // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 
     486            // This is stored in the System's file.encoding property. 
     487            gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));  
     488 
     489            // Make sure the path matches the OS that is running 
     490            if (is_unix_path && Utility.isWindows()) { 
     491                // Convert path from Unix to Windows 
     492                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 
     493            } 
     494            else if (!is_unix_path && !Utility.isWindows()) { 
     495                // Convert path from Windows to Unix 
     496                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 
     497            } 
     498 
     499            // Remember this for quick access later 
     500            if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 
     501                source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 
     502            } 
     503 
     504            ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 
     505            } 
     506 
    250507            // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory 
    251508            // This is true when the source files come from a zip file processed by ZIPPlug, for example 
     
    278535    } 
    279536    } 
     537    */ 
     538 
    280539} 
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java

    r13337 r18362  
    6060 
    6161 
    62     static public void loadDocXMLFiles(File directory) 
     62    static public void loadDocXMLFiles(File directory, String filename_match) 
    6363    { 
    6464    // Make sure the directory (archives) exists 
     
    7272        File child_file = directory_files[i]; 
    7373        if (child_file.isDirectory()) { 
    74         loadDocXMLFiles(child_file); 
     74        loadDocXMLFiles(child_file,filename_match);  
    7575        } 
    76         else if (child_file.getName().equals("doc.xml")) { 
    77         loadDocXMLFile(child_file); 
     76        else if (child_file.getName().equals(filename_match)) { 
     77        // e.g. doc.xml (for regular Greenstone) or docmets.xml (for Fedora) 
     78 
     79        loadDocXMLFile(child_file,filename_match); 
    7880        } 
    7981    } 
     
    8183 
    8284 
    83     static private void loadDocXMLFile(File doc_xml_file_file) 
     85    static private void loadDocXMLFile(File doc_xml_file_file,String filename_match) 
    8486    { 
    85     DocXMLFile doc_xml_file = new DocXMLFile(doc_xml_file_file.getAbsolutePath()); 
    86     try { 
     87    String file = doc_xml_file_file.getAbsolutePath(); 
     88 
     89    // Need to do typecasts in the following to keep Java 1.4 happy 
     90    DocXMLFile doc_xml_file  
     91        = (filename_match.equals("docmets.xml"))  
     92        ? (DocXMLFile) new DocMetsXMLFile(file)  
     93        : (DocXMLFile) new DocGAFile(file); 
     94 
     95    try {        
    8796        doc_xml_file.skimFile(); 
    8897        doc_xml_files.add(doc_xml_file); 
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/MetadataXMLFileManager.java

    r13818 r18362  
    368368 
    369369        // Upload the files modified since last time, then reset the list 
    370         RemoteGreenstoneServer.uploadCollectionFiles(CollectionManager.getLoadedCollectionName(), (File[]) modified_metadata_xml_files.toArray(new File[0])); 
     370        Gatherer.remoteGreenstoneServer.uploadCollectionFiles( 
     371                  CollectionManager.getLoadedCollectionName(), (File[]) modified_metadata_xml_files.toArray(new File[0])); 
    371372        modified_metadata_xml_files.clear(); 
    372373    } 
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/ProfileXMLFile.java

    r13808 r18362  
    110110    // This is inefficient but for simplicity we'll just upload the file every time it is changed 
    111111    if (Gatherer.isGsdlRemote) { 
    112         RemoteGreenstoneServer.uploadCollectionFile(CollectionManager.getLoadedCollectionName(), this); 
     112        Gatherer.remoteGreenstoneServer.uploadCollectionFile(CollectionManager.getLoadedCollectionName(), this); 
    113113    } 
    114114