Changeset 17009

Show
Ignore:
Timestamp:
26.08.2008 17:23:26 (11 years ago)
Author:
davidb
Message:

Generalisation of skimming doc.xml files to find extracted metadata. Can now read docmets.xml files in export folder instead of doc.xml in archives folder.

Location:
gli/trunk/src/org/greenstone/gatherer
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • gli/trunk/src/org/greenstone/gatherer/collection/CollectionManager.java

    r16987 r17009  
    13151315 
    13161316        // Read through the doc.xml files in the archives directory 
    1317         File collection_archives_directory = new File(getLoadedCollectionArchivesDirectoryPath()); 
     1317 
    13181318        DocXMLFileManager.clearDocXMLFiles(); 
    1319         DocXMLFileManager.loadDocXMLFiles(collection_archives_directory); 
     1319 
     1320        if (Configuration.fedora_info.isActive()) { // FLI case 
     1321        File collection_export_directory = new File(getLoadedCollectionExportDirectoryPath()); 
     1322        DocXMLFileManager.loadDocXMLFiles(collection_export_directory,"docmets.xml"); 
     1323        } 
     1324        else { 
     1325        File collection_archives_directory = new File(getLoadedCollectionArchivesDirectoryPath()); 
     1326        DocXMLFileManager.loadDocXMLFiles(collection_archives_directory,"doc.xml"); 
     1327        } 
     1328 
    13201329 
    13211330        // Get a list of the collection specific classifiers and plugins 
     
    13581367     * more situations. */ 
    13591368    public static boolean canDoScheduling() { 
     1369    // Would be nice to support more of these, rather than returning false 
    13601370    if(Gatherer.isGsdlRemote) { 
    13611371        return false; 
    13621372    }  
    13631373    if(Gatherer.GS3) { 
     1374        return false; 
     1375    } 
     1376    if (Configuration.fedora_info.isActive()) { 
    13641377        return false; 
    13651378    } 
  • gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r16671 r17009  
    3636 
    3737/** This class represents one doc.xml file */ 
    38 public class DocXMLFile 
    39     extends File 
     38 
     39public abstract class DocXMLFile extends File 
    4040{ 
    41     private HashMap source_file_name_to_description_elements_mapping = new HashMap(); 
    42  
     41    protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); 
     42 
     43    protected static String MetadataWrap = null; 
     44    protected static String MetadataItem = null; 
    4345 
    4446    public DocXMLFile(String doc_xml_file_path) 
     
    5961    } 
    6062 
    61     // Check whether this doc.xml file contains extracted metadata for the specified file 
     63    // Check whether this file (doc.xml, or docmets.xml in the METS subclass) contains extracted metadata for the specified file 
    6264    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
    6365    if (description_elements_list == null) { 
     
    6870    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
    6971 
    70     // Parse the doc.xml file 
    71     DebugStream.println("Applicable doc.xml file: " + this); 
     72    // Parse the file 
     73    DebugStream.println("Applicable file: " + this); 
    7274    try { 
    7375        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     
    7981        String line = null; 
    8082        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
    81         // Check if this line contains the start of a relevant Description element 
     83        // Check if this line contains the start of a relevant "Description" element 
     84        // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)  
    8285        if (line_num == next_description_element_start) { 
    8386            in_relevant_description_element = true; 
     
    9194 
    9295        // Check if this line contains the end of the relevant Description element 
    93         if (line.indexOf("</Description>") != -1) { 
     96        if (line.indexOf("</"+MetadataWrap+">") != -1) { 
    9497            description_element_num++; 
    9598            if (description_element_num == description_elements_list.size()) { 
     
    103106 
    104107        // If this line doesn't contain a complete Metadata element, we're not interested 
    105         if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) { 
     108        if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) { 
    106109            continue; 
    107110        } 
     
    135138        // Value trees are not stored for extracted metadata, so create a new value tree node now 
    136139        int value_index = line.indexOf(">", name_index) + ">".length(); 
    137         String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 
     140        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">")); 
    138141 
    139142        metadata_element.addMetadataValue(metadata_element_value); 
     
    158161 
    159162 
     163 
     164 
    160165    /** 
    161      * Every doc.xml file must be skimmed when a collection is opened, for two reasons: 
     166     * Every file must be skimmed when a collection is opened, for two reasons: 
    162167     *   - To build a mapping from source file to its corresponding doc.xml file 
    163168     *   - To get a complete list of all extracted metadata elements 
     
    167172    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
    168173 
    169     // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 
    170     DebugStream.println("Skimming doc.xml file " + this + "..."); 
     174    // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements 
     175    DebugStream.println("Skimming " + this + "..."); 
    171176    try { 
    172177        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     
    175180        String line = null; 
    176181        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
    177         // This line contains the start of a Description element 
    178         if (line.indexOf("<Description>") != -1) { 
     182        // This line contains the start of a "MetadataWrap" element  
     183        // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)  
     184        if (line.indexOf("<"+MetadataWrap+">") != -1) { 
    179185            if (description_element_start != -1) { 
    180             System.err.println("Parse error: previous Description element unfinished!"); 
     186            System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!"); 
    181187            } 
    182188            description_element_start = line_num; 
     
    184190        } 
    185191 
    186         // This line contains the end of a Description element 
    187         if (line.indexOf("</Description>") != -1) { 
     192        // This line contains the end of a "MetadataWrap" element 
     193        if (line.indexOf("</"+MetadataWrap+">") != -1) { 
    188194            if (description_element_start == -1) { 
    189             System.err.println("Parse error: Description element unstarted!"); 
     195            System.err.println("Parse error: "+MetadataWrap+" element unstarted!"); 
    190196            } 
    191197            description_element_start = -1; 
     
    193199        } 
    194200 
    195         // If we're not in a Description element there shouldn't be any Metadata elements 
     201        // If we're not in a "MetadataWrap" element there shouldn't be any Metadata elements 
    196202        if (description_element_start == -1) { 
    197203            continue; 
     
    199205 
    200206        // This line doesn't contain a Metadata element, so we're not interested 
    201         if (line.indexOf("<Metadata ") == -1) { 
    202             DebugStream.println("Warning: Description element line doesn't contain Metadata element."); 
     207        if (line.indexOf("<"+MetadataItem+" ") == -1) { 
     208            DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element."); 
    203209            continue; 
    204210        } 
     
    217223        String metadata_element_name = metadata_element_name_full; 
    218224 
    219         // Note which file this doc.xml is for 
     225        // Note which file this is for 
    220226        if (metadata_element_name.equals("gsdlsourcefilename")) { 
    221227            // Extract the gsdlsourcefilename element value 
     
    283289    } 
    284290    } 
     291 
     292 
     293    /* 
     294    public ArrayList getMetadataExtractedFromFile(File file) 
     295    { 
     296    // Build up a list of metadata extracted from this file 
     297    ArrayList metadata_values = new ArrayList(); 
     298 
     299    String file_relative_path = file.getAbsolutePath(); 
     300    int import_index = file_relative_path.indexOf("import"); 
     301    if (import_index != -1) { 
     302        file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 
     303    } 
     304 
     305    // Check whether this doc.xml file contains extracted metadata for the specified file 
     306    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
     307    if (description_elements_list == null) { 
     308        // ...it doesn't 
     309        return metadata_values; 
     310    } 
     311 
     312    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
     313 
     314    // Parse the doc.xml file 
     315    DebugStream.println("Applicable doc.xml file: " + this); 
     316    try { 
     317        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     318 
     319        int description_element_num = 0; 
     320        int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 
     321        boolean in_relevant_description_element = false; 
     322 
     323        String line = null; 
     324        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
     325        // Check if this line contains the start of a relevant Description element 
     326        if (line_num == next_description_element_start) { 
     327            in_relevant_description_element = true; 
     328            continue; 
     329        } 
     330 
     331        // If we're not in a relevant Description element we don't care about anything 
     332        if (in_relevant_description_element == false) { 
     333            continue; 
     334        } 
     335 
     336        // Check if this line contains the end of the relevant Description element 
     337        if (line.indexOf("</Description>") != -1) { 
     338            description_element_num++; 
     339            if (description_element_num == description_elements_list.size()) { 
     340            break; 
     341            } 
     342 
     343            next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 
     344            in_relevant_description_element = false; 
     345            continue; 
     346        } 
     347 
     348        // If this line doesn't contain a complete Metadata element, we're not interested 
     349        if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) { 
     350            continue; 
     351        } 
     352 
     353        // Extract the metadata element name 
     354        int name_index = line.indexOf(" name=\"") + " name=\"".length(); 
     355        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 
     356 
     357        // If the metadata has a namespace it isn't extracted metadata, so we're not interested 
     358        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 
     359        if (!metadata_set_namespace.equals("")) { 
     360            continue; 
     361        } 
     362 
     363        // Extracted metadata! 
     364        String metadata_element_name = metadata_element_name_full; 
     365 
     366        // We completely ignore bibliographic data 
     367        if (metadata_element_name.equals("SourceSegment")) { 
     368            buffered_reader.close(); 
     369            return new ArrayList(); 
     370        } 
     371 
     372        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 
     373        if (metadata_element_name.startsWith("gsdl")) { 
     374            continue; 
     375        } 
     376 
     377        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 
     378 
     379        // Value trees are not stored for extracted metadata, so create a new value tree node now 
     380        int value_index = line.indexOf(">", name_index) + ">".length(); 
     381        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 
     382 
     383        metadata_element.addMetadataValue(metadata_element_value); 
     384        MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value); 
     385 
     386        // Add the new metadata value to the list 
     387        MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node); 
     388        metadata_values.add(metadata_value); 
     389        } 
     390 
     391        buffered_reader.close(); 
     392    } 
     393    catch (FileNotFoundException exception) { 
     394        DebugStream.printStackTrace(exception); 
     395    } 
     396    catch (IOException exception) { 
     397        DebugStream.printStackTrace(exception); 
     398    } 
     399 
     400    return metadata_values; 
     401    } 
     402 
     403    */ 
     404 
     405    /** 
     406     * Every doc.xml file must be skimmed when a collection is opened, for two reasons: 
     407     *   - To build a mapping from source file to its corresponding doc.xml file 
     408     *   - To get a complete list of all extracted metadata elements 
     409     */ 
     410    /* 
     411    public void skimFile() 
     412    { 
     413    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 
     414 
     415    // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 
     416    DebugStream.println("Skimming " + this + "..."); 
     417    try { 
     418        BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 
     419        int description_element_start = -1; 
     420 
     421        String line = null; 
     422        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 
     423        // This line contains the start of a Description element 
     424        if (line.indexOf("<Description>") != -1) { 
     425            if (description_element_start != -1) { 
     426            System.err.println("Parse error: previous Description element unfinished!"); 
     427            } 
     428            description_element_start = line_num; 
     429            continue; 
     430        } 
     431 
     432        // This line contains the end of a Description element 
     433        if (line.indexOf("</Description>") != -1) { 
     434            if (description_element_start == -1) { 
     435            System.err.println("Parse error: Description element unstarted!"); 
     436            } 
     437            description_element_start = -1; 
     438            continue; 
     439        } 
     440 
     441        // If we're not in a Description element there shouldn't be any Metadata elements 
     442        if (description_element_start == -1) { 
     443            continue; 
     444        } 
     445 
     446        // This line doesn't contain a Metadata element, so we're not interested 
     447        if (line.indexOf("<Metadata ") == -1) { 
     448            DebugStream.println("Warning: Description element line doesn't contain Metadata element."); 
     449            continue; 
     450        } 
     451 
     452        // Extract the metadata element name 
     453        int name_index = line.indexOf(" name=\"") + " name=\"".length(); 
     454        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 
     455 
     456        // If the metadata has a namespace it isn't extracted metadata, so we're not interested 
     457        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 
     458        if (!metadata_set_namespace.equals("")) { 
     459            continue; 
     460        } 
     461 
     462        // Extracted metadata! 
     463        String metadata_element_name = metadata_element_name_full; 
     464 
     465        // Note which file this doc.xml is for 
     466        if (metadata_element_name.equals("gsdlsourcefilename")) { 
     467            // Extract the gsdlsourcefilename element value 
     468            int value_index = line.indexOf(">", name_index) + ">".length(); 
     469            String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index)); 
     470 
     471            // We're only interested in the path relative to the import folder 
     472            int import_index = gsdlsourcefilename_value.indexOf("import"); 
     473            if (import_index != -1) { 
     474            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length()); 
     475 
     476            boolean is_unix_path = gsdlsourcefilename_value.startsWith("/"); 
     477            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 
     478 
     479            // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 
     480            // This is stored in the System's file.encoding property. 
     481            gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));  
     482 
     483            // Make sure the path matches the OS that is running 
     484            if (is_unix_path && Utility.isWindows()) { 
     485                // Convert path from Unix to Windows 
     486                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 
     487            } 
     488            else if (!is_unix_path && !Utility.isWindows()) { 
     489                // Convert path from Windows to Unix 
     490                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 
     491            } 
     492 
     493            // Remember this for quick access later 
     494            if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 
     495                source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 
     496            } 
     497 
     498            ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 
     499            } 
     500 
     501            // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory 
     502            // This is true when the source files come from a zip file processed by ZIPPlug, for example 
     503            else if (gsdlsourcefilename_value.indexOf("tmp") == -1) { 
     504            // We don't really know what is going on... 
     505            System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value); 
     506            } 
     507        } 
     508 
     509        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 
     510        if (metadata_element_name.startsWith("gsdl")) { 
     511            continue; 
     512        } 
     513 
     514        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 
     515        if (metadata_element == null) { 
     516            // This element isn't defined in ex.mds, so create it for this session 
     517            DebugStream.println("Extracted metadata element not defined: " + metadata_element_name); 
     518            extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name); 
     519        } 
     520        } 
     521 
     522        buffered_reader.close(); 
     523    } 
     524    catch (FileNotFoundException exception) { 
     525        DebugStream.printStackTrace(exception); 
     526    } 
     527    catch (IOException exception) { 
     528        DebugStream.printStackTrace(exception); 
     529    } 
     530    } 
     531    */ 
     532 
    285533} 
  • gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java

    r13337 r17009  
    6060 
    6161 
    62     static public void loadDocXMLFiles(File directory) 
     62    static public void loadDocXMLFiles(File directory, String filename_match) 
    6363    { 
    6464    // Make sure the directory (archives) exists 
     
    7272        File child_file = directory_files[i]; 
    7373        if (child_file.isDirectory()) { 
    74         loadDocXMLFiles(child_file); 
     74        loadDocXMLFiles(child_file,filename_match);  
    7575        } 
    76         else if (child_file.getName().equals("doc.xml")) { 
    77         loadDocXMLFile(child_file); 
     76        else if (child_file.getName().equals(filename_match)) { 
     77        // e.g. doc.xml (for regular Greenstone) or docmets.xml (for Fedora) 
     78 
     79        loadDocXMLFile(child_file,filename_match); 
    7880        } 
    7981    } 
     
    8183 
    8284 
    83     static private void loadDocXMLFile(File doc_xml_file_file) 
     85    static private void loadDocXMLFile(File doc_xml_file_file,String filename_match) 
    8486    { 
    85     DocXMLFile doc_xml_file = new DocXMLFile(doc_xml_file_file.getAbsolutePath()); 
    86     try { 
     87    String file = doc_xml_file_file.getAbsolutePath(); 
     88 
     89    DocXMLFile doc_xml_file  
     90        = (filename_match.equals("docmets.xml")) ? new DocMetsXMLFile(file) : new DocGAFile(file); 
     91 
     92    try {        
    8793        doc_xml_file.skimFile(); 
    8894        doc_xml_files.add(doc_xml_file); 
  • gli/trunk/src/org/greenstone/gatherer/shell/GShell.java

    r16129 r17009  
    405405        fireMessage(type, typeAsString(type) + "> " + Dictionary.get("GShell.Parsing_Metadata_Start"), status, null); 
    406406        DocXMLFileManager.clearDocXMLFiles(); 
    407         DocXMLFileManager.loadDocXMLFiles(new File(CollectionManager.getLoadedCollectionArchivesDirectoryPath())); 
     407        if (Configuration.fedora_info.isActive()) { // FLI case 
     408            File collection_export_directory = new File(CollectionManager.getLoadedCollectionExportDirectoryPath()); 
     409            DocXMLFileManager.loadDocXMLFiles(collection_export_directory,"docmets.xml"); 
     410        } 
     411        else { 
     412            File collection_archives_directory = new File(CollectionManager.getLoadedCollectionArchivesDirectoryPath()); 
     413            DocXMLFileManager.loadDocXMLFiles(collection_archives_directory,"doc.xml"); 
     414        } 
     415 
     416 
    408417        fireMessage(type, typeAsString(type) + "> " + Dictionary.get("GShell.Parsing_Metadata_Complete"), status, null); 
    409418        }