Changeset 18362


Ignore:
Timestamp:
01/12/09 11:19:23 (12 years ago)
Author:
kjdon
Message:

updated the rtl-gli branch with files from trunk. Result of a merge 14807:18318

Location:
gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata
Files:
4 edited
2 copied

Legend:

Unmodified
Added
Removed
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r12041 r18362  
    3030import java.io.*;
    3131import java.util.*;
     32import java.net.URLDecoder;
    3233import org.greenstone.gatherer.DebugStream;
    3334import org.greenstone.gatherer.util.Utility;
     
    3536
    3637/** This class represents one doc.xml file */
    37 public class DocXMLFile
    38     extends File
     38
     39public abstract class DocXMLFile extends File
    3940{
    40     private HashMap source_file_name_to_description_elements_mapping = new HashMap();
    41 
    42 
    43     public DocXMLFile(String doc_xml_file_path)
     41    protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
     42
     43    protected final String MetadataWrap;
     44    protected final String MetadataItem;
     45
     46    public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
    4447    {
    4548    super(doc_xml_file_path);
     49    this.MetadataWrap = metaWrap;
     50    this.MetadataItem = metaItem;
    4651    }
    4752
     
    5863    }
    5964
    60     // Check whether this doc.xml file contains extracted metadata for the specified file
     65    // Check whether this file (i.e. doc.xml, or docmets.xml in the METS subclass) contains extracted metadata for the specified file
    6166    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
    6267    if (description_elements_list == null) {
     
    6772    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
    6873
    69     // Parse the doc.xml file
    70     DebugStream.println("Applicable doc.xml file: " + this);
     74    // Parse the file
     75    DebugStream.println("Applicable file: " + this);
    7176    try {
    7277        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     
    7883        String line = null;
    7984        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
    80         // Check if this line contains the start of a relevant Description element
     85        // Check if this line contains the start of a relevant "Description" element
     86        // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
    8187        if (line_num == next_description_element_start) {
    8288            in_relevant_description_element = true;
     
    9096
    9197        // Check if this line contains the end of the relevant Description element
    92         if (line.indexOf("</Description>") != -1) {
     98        if (line.indexOf("</"+MetadataWrap+">") != -1) {
    9399            description_element_num++;
    94100            if (description_element_num == description_elements_list.size()) {
     
    102108
    103109        // If this line doesn't contain a complete Metadata element, we're not interested
    104         if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
     110        if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
    105111            continue;
    106112        }
     
    134140        // Value trees are not stored for extracted metadata, so create a new value tree node now
    135141        int value_index = line.indexOf(">", name_index) + ">".length();
    136         String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
     142        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
    137143
    138144        metadata_element.addMetadataValue(metadata_element_value);
     
    157163
    158164
     165
     166
    159167    /**
    160      * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
     168     * Every file must be skimmed when a collection is opened, for two reasons:
    161169     *   - To build a mapping from source file to its corresponding doc.xml file
    162170     *   - To get a complete list of all extracted metadata elements
     
    166174    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
    167175
    168     // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
    169     DebugStream.println("Skimming doc.xml file " + this + "...");
     176    // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
     177    DebugStream.println("Skimming " + this + "...");
    170178    try {
    171179        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     
    174182        String line = null;
    175183        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
    176         // This line contains the start of a Description element
    177         if (line.indexOf("<Description>") != -1) {
     184        // This line contains the start of a "MetadataWrap" element
     185        // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
     186        if (line.indexOf("<"+MetadataWrap+">") != -1) {
    178187            if (description_element_start != -1) {
    179             System.err.println("Parse error: previous Description element unfinished!");
     188            System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
    180189            }
    181190            description_element_start = line_num;
     
    183192        }
    184193
    185         // This line contains the end of a Description element
    186         if (line.indexOf("</Description>") != -1) {
     194        // This line contains the end of a "MetadataWrap" element
     195        if (line.indexOf("</"+MetadataWrap+">") != -1) {
    187196            if (description_element_start == -1) {
    188             System.err.println("Parse error: Description element unstarted!");
     197            System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
    189198            }
    190199            description_element_start = -1;
     
    192201        }
    193202
    194         // If we're not in a Description element there shouldn't be any Metadata elements
     203        // If we're not in a "MetadataWrap" element there shouldn't be any Metadata elements
    195204        if (description_element_start == -1) {
    196205            continue;
     
    198207
    199208        // This line doesn't contain a Metadata element, so we're not interested
    200         if (line.indexOf("<Metadata ") == -1) {
    201             DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
     209        if (line.indexOf("<"+MetadataItem+" ") == -1) {
     210            DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
    202211            continue;
    203212        }
     
    216225        String metadata_element_name = metadata_element_name_full;
    217226
    218         // Note which file this doc.xml is for
     227        // Note which file this is for
    219228        if (metadata_element_name.equals("gsdlsourcefilename")) {
    220229            // Extract the gsdlsourcefilename element value
     
    229238            boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
    230239            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
     240
     241            // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
     242            // This is stored in the System's file.encoding property.
     243            gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
    231244
    232245            // Make sure the path matches the OS that is running
     
    248261            }
    249262
     263            // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
     264            // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
     265            // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
     266            // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
     267            else if (gsdlsourcefilename_value.indexOf("tmp") == -1
     268                 && !gsdlsourcefilename_value.endsWith("collect.cfg")
     269                 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
     270            // We don't really know what is going on...
     271            System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
     272            }
     273        }
     274
     275        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
     276        if (metadata_element_name.startsWith("gsdl")) {
     277            continue;
     278        }
     279
     280        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
     281        if (metadata_element == null) {
     282            // This element isn't defined in ex.mds, so create it for this session
     283            DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
     284            extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
     285        }
     286        }
     287
     288        buffered_reader.close();
     289    }
     290    catch (FileNotFoundException exception) {
     291        DebugStream.printStackTrace(exception);
     292    }
     293    catch (IOException exception) {
     294        DebugStream.printStackTrace(exception);
     295    }
     296    }
     297
     298
     299    /*
     300    public ArrayList getMetadataExtractedFromFile(File file)
     301    {
     302    // Build up a list of metadata extracted from this file
     303    ArrayList metadata_values = new ArrayList();
     304
     305    String file_relative_path = file.getAbsolutePath();
     306    int import_index = file_relative_path.indexOf("import");
     307    if (import_index != -1) {
     308        file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
     309    }
     310
     311    // Check whether this doc.xml file contains extracted metadata for the specified file
     312    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
     313    if (description_elements_list == null) {
     314        // ...it doesn't
     315        return metadata_values;
     316    }
     317
     318    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
     319
     320    // Parse the doc.xml file
     321    DebugStream.println("Applicable doc.xml file: " + this);
     322    try {
     323        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     324
     325        int description_element_num = 0;
     326        int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
     327        boolean in_relevant_description_element = false;
     328
     329        String line = null;
     330        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
     331        // Check if this line contains the start of a relevant Description element
     332        if (line_num == next_description_element_start) {
     333            in_relevant_description_element = true;
     334            continue;
     335        }
     336
     337        // If we're not in a relevant Description element we don't care about anything
     338        if (in_relevant_description_element == false) {
     339            continue;
     340        }
     341
     342        // Check if this line contains the end of the relevant Description element
     343        if (line.indexOf("</Description>") != -1) {
     344            description_element_num++;
     345            if (description_element_num == description_elements_list.size()) {
     346            break;
     347            }
     348
     349            next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
     350            in_relevant_description_element = false;
     351            continue;
     352        }
     353
     354        // If this line doesn't contain a complete Metadata element, we're not interested
     355        if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
     356            continue;
     357        }
     358
     359        // Extract the metadata element name
     360        int name_index = line.indexOf(" name=\"") + " name=\"".length();
     361        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
     362
     363        // If the metadata has a namespace it isn't extracted metadata, so we're not interested
     364        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
     365        if (!metadata_set_namespace.equals("")) {
     366            continue;
     367        }
     368
     369        // Extracted metadata!
     370        String metadata_element_name = metadata_element_name_full;
     371
     372        // We completely ignore bibliographic data
     373        if (metadata_element_name.equals("SourceSegment")) {
     374            buffered_reader.close();
     375            return new ArrayList();
     376        }
     377
     378        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
     379        if (metadata_element_name.startsWith("gsdl")) {
     380            continue;
     381        }
     382
     383        MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
     384
     385        // Value trees are not stored for extracted metadata, so create a new value tree node now
     386        int value_index = line.indexOf(">", name_index) + ">".length();
     387        String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
     388
     389        metadata_element.addMetadataValue(metadata_element_value);
     390        MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
     391
     392        // Add the new metadata value to the list
     393        MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
     394        metadata_values.add(metadata_value);
     395        }
     396
     397        buffered_reader.close();
     398    }
     399    catch (FileNotFoundException exception) {
     400        DebugStream.printStackTrace(exception);
     401    }
     402    catch (IOException exception) {
     403        DebugStream.printStackTrace(exception);
     404    }
     405
     406    return metadata_values;
     407    }
     408
     409    */
     410
     411    /**
     412     * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
     413     *   - To build a mapping from source file to its corresponding doc.xml file
     414     *   - To get a complete list of all extracted metadata elements
     415     */
     416    /*
     417    public void skimFile()
     418    {
     419    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
     420
     421    // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
     422    DebugStream.println("Skimming " + this + "...");
     423    try {
     424        BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
     425        int description_element_start = -1;
     426
     427        String line = null;
     428        for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
     429        // This line contains the start of a Description element
     430        if (line.indexOf("<Description>") != -1) {
     431            if (description_element_start != -1) {
     432            System.err.println("Parse error: previous Description element unfinished!");
     433            }
     434            description_element_start = line_num;
     435            continue;
     436        }
     437
     438        // This line contains the end of a Description element
     439        if (line.indexOf("</Description>") != -1) {
     440            if (description_element_start == -1) {
     441            System.err.println("Parse error: Description element unstarted!");
     442            }
     443            description_element_start = -1;
     444            continue;
     445        }
     446
     447        // If we're not in a Description element there shouldn't be any Metadata elements
     448        if (description_element_start == -1) {
     449            continue;
     450        }
     451
     452        // This line doesn't contain a Metadata element, so we're not interested
     453        if (line.indexOf("<Metadata ") == -1) {
     454            DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
     455            continue;
     456        }
     457
     458        // Extract the metadata element name
     459        int name_index = line.indexOf(" name=\"") + " name=\"".length();
     460        String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
     461
     462        // If the metadata has a namespace it isn't extracted metadata, so we're not interested
     463        String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
     464        if (!metadata_set_namespace.equals("")) {
     465            continue;
     466        }
     467
     468        // Extracted metadata!
     469        String metadata_element_name = metadata_element_name_full;
     470
     471        // Note which file this doc.xml is for
     472        if (metadata_element_name.equals("gsdlsourcefilename")) {
     473            // Extract the gsdlsourcefilename element value
     474            int value_index = line.indexOf(">", name_index) + ">".length();
     475            String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
     476
     477            // We're only interested in the path relative to the import folder
     478            int import_index = gsdlsourcefilename_value.indexOf("import");
     479            if (import_index != -1) {
     480            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
     481
     482            boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
     483            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
     484
     485            // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
     486            // This is stored in the System's file.encoding property.
     487            gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
     488
     489            // Make sure the path matches the OS that is running
     490            if (is_unix_path && Utility.isWindows()) {
     491                // Convert path from Unix to Windows
     492                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
     493            }
     494            else if (!is_unix_path && !Utility.isWindows()) {
     495                // Convert path from Windows to Unix
     496                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
     497            }
     498
     499            // Remember this for quick access later
     500            if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
     501                source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
     502            }
     503
     504            ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
     505            }
     506
    250507            // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
    251508            // This is true when the source files come from a zip file processed by ZIPPlug, for example
     
    278535    }
    279536    }
     537    */
     538
    280539}
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java

    r13337 r18362  
    6060
    6161
    62     static public void loadDocXMLFiles(File directory)
     62    static public void loadDocXMLFiles(File directory, String filename_match)
    6363    {
    6464    // Make sure the directory (archives) exists
     
    7272        File child_file = directory_files[i];
    7373        if (child_file.isDirectory()) {
    74         loadDocXMLFiles(child_file);
     74        loadDocXMLFiles(child_file,filename_match);
    7575        }
    76         else if (child_file.getName().equals("doc.xml")) {
    77         loadDocXMLFile(child_file);
     76        else if (child_file.getName().equals(filename_match)) {
     77        // e.g. doc.xml (for regular Greenstone, docmets.xml for Fedora)
     78
     79        loadDocXMLFile(child_file,filename_match);
    7880        }
    7981    }
     
    8183
    8284
    83     static private void loadDocXMLFile(File doc_xml_file_file)
     85    static private void loadDocXMLFile(File doc_xml_file_file,String filename_match)
    8486    {
    85     DocXMLFile doc_xml_file = new DocXMLFile(doc_xml_file_file.getAbsolutePath());
    86     try {
     87    String file = doc_xml_file_file.getAbsolutePath();
     88
     89    // Need to do typecasts in the following to keep Java 1.4 happy
     90    DocXMLFile doc_xml_file
     91        = (filename_match.equals("docmets.xml"))
     92        ? (DocXMLFile) new DocMetsXMLFile(file)
     93        : (DocXMLFile) new DocGAFile(file);
     94
     95    try {       
    8796        doc_xml_file.skimFile();
    8897        doc_xml_files.add(doc_xml_file);
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/MetadataXMLFileManager.java

    r13818 r18362  
    368368
    369369        // Upload the files modified since last time, then reset the list
    370         RemoteGreenstoneServer.uploadCollectionFiles(CollectionManager.getLoadedCollectionName(), (File[]) modified_metadata_xml_files.toArray(new File[0]));
     370        Gatherer.remoteGreenstoneServer.uploadCollectionFiles(
     371                  CollectionManager.getLoadedCollectionName(), (File[]) modified_metadata_xml_files.toArray(new File[0]));
    371372        modified_metadata_xml_files.clear();
    372373    }
  • gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/ProfileXMLFile.java

    r13808 r18362  
    110110    // This is inefficient but for simplicity we'll just upload the file every time it is changed
    111111    if (Gatherer.isGsdlRemote) {
    112         RemoteGreenstoneServer.uploadCollectionFile(CollectionManager.getLoadedCollectionName(), this);
     112        Gatherer.remoteGreenstoneServer.uploadCollectionFile(CollectionManager.getLoadedCollectionName(), this);
    113113    }
    114114
Note: See TracChangeset for help on using the changeset viewer.