Context Navigation

← Previous Changeset
Next Changeset →

Changeset 5944

Timestamp:

2003-11-24T14:26:35+13:00 (20 years ago)

Author:

cs025

Message:

Index document type, metadata extensions

Location:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes

Files:

: 3 added
: 11 edited

AbstractDocument.java (modified) (10 diffs)
DocumentFactory.java (added)
DocumentInterface.java (modified) (2 diffs)
DocumentList.java (modified) (2 diffs)
DocumentLoader.java (modified) (1 diff)
DocumentSQLWriter.java (modified) (1 diff)
GMLRecogniser.java (modified) (1 diff)
HTMLDocument.java (modified) (1 diff)
HTMLRecogniser.java (modified) (1 diff)
IndexDocument.java (added)
IndexRecogniser.java (added)
RecogniserManager.java (modified) (1 diff)
TextDocument.java (modified) (1 diff)
TextRecogniser.java (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java

-              r5800
+              r5944
 import java.util.List;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.HashMap;
 import java.util.Map;
+import java.sql.SQLException;
+import java.sql.ResultSet;
 import java.net.URL;
 …
 import org.greenstone.gsdl3.gs3build.util.MultiMap;
+import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection;
 /**
 …
   METSHeader       header;
   DocumentID       id;
+  boolean          isModified;
+  /**
+   *  <p>Create a very vanilla document with a given document identifier.</p>
+   *  <p>Most commonly used in dealing with loading files using DocumentFactory
+   *  or similar.</p>
+   *
+   *  @param <code>DocumentID</code> the document identifier
+   */
+  public AbstractDocument(DocumentID id)
+  { this.fileSet      = new METSFileSet();
+    this.metadata     = new METSDescriptiveSet();
+    this.header       = new METSHeader();
+    this.structureSet = new METSStructureSet();
+    this.id           = id;
+  }
+  /**
+   *  Create a basic document from a given <code>URL</code>.  This is usually the form
+   *  called through the recognisers.
+   *
+   *  @param <code>URL</code> the URL of the first file in the document package
+   */
   public AbstractDocument(URL url)
   { this.fileSet      = new METSFileSet();
 …
     this.header       = new METSHeader();
     this.structureSet = new METSStructureSet();
+    this.id           = null;
     METSStructure structure = new METSStructure("All", "All", "Whole Document");
 …
       namespace = GSDL3Namespace.GSDL3_NAMESPACE_ID;
+    }
+    // no need to set isModified, as the following call will do it anyway!
     this.addDocumentMetadata(namespace, name, value);
+  }
 …
   public void addDocumentMetadata(String namespace, String label, String value)
   { this.metadata.addMetadata("default", namespace, label, value);
+  }
+  /**
+   *  @see DocumentInterface:setDocumentMetadata
+    this.isModified = true;
+  }
+  /**
+   *  Post metadata to a file in this document - the appropriate changes
+   *  should be made...
+   */
+  public void postFileMetadata(URL fileLocation, String namespace, String label, String value)
+  {
+    // First get the list of file groups, etc. that this file is associated with...
+    List fileGroups = this.fileSet.findGroups(fileLocation);
+    // Next, get the METS divisions associated with each file group...
+    List divisions = this.structureSet.findDivisionsForFiles(fileGroups);
+    // Finally, post the metadata to the metadata group associated with each structure
+    Iterator divisionIter = divisions.iterator();
+    while (divisionIter.hasNext())
+    { METSDivision division = (METSDivision) divisionIter.next();
+      // get the open namespace for this division
+      METSNamespace namespaceMetadata = division.findNamespace(namespace, true, this.metadata);
+      // then post the metadata to it...
+      namespaceMetadata.addMetadata(label, value);
+    }
+  }
+  /**
+   *  @see DocumentInterface:setDocumentMetadata
    */
   public void setDocumentMetadata(String namespace, String label, String value)
   { this.metadata.setMetadata("default", namespace, label, value);
+    this.isModified = true;
+  }
 …
   /**
+   *  Set the metadata structure for this document
+   *
+   *  @param <code>METSDescriptiveSet</code> the new metadata holder for the document.
+   */
+  public void setDocumentMetadata(METSDescriptiveSet metadata)
+  { this.metadata = metadata;
+    this.isModified = true;
+  }
+  /**
    *  Get the metadata structure of the document
+   *
 …
   public METSStructureSet getDocumentStructure()
   { return this.structureSet;
+  }
+  public void setDocumentStructure(METSStructureSet structureSet)
+  { this.structureSet = structureSet;
+  }
 …
   { return this.fileSet;
+  }
+  public void setDocumentFiles(METSFileSet fileSet)
+  { this.fileSet = fileSet;
+  }
   /**
 …
   { return new DocumentSQLWriter();
+  }
+  /**
+   *  Obtain a document from the SQL database
+   */
+  public static AbstractDocument readSQL(GS3SQLConnection connection, ResultSet sqlResult)
+  { try {
+      DocumentID id = new DocumentID(sqlResult.getString("DocID"));
+      String     type = sqlResult.getString("docType");
+      // Use a factory method to create the correct subtype...
+      AbstractDocument document = DocumentFactory.createDocument(type, id);
+      // Get the individual components of the document
+      METSFileSet fileSet = METSFileSet.readSQL(document, connection);
+      document.setDocumentFiles(fileSet);
+      METSDescriptiveSet descriptiveSet = METSDescriptiveSet.readSQL(document, connection);
+      document.setDocumentMetadata(descriptiveSet);
+      METSStructureSet structureSet = METSStructureSet.readSQL(document, connection);
+      document.setDocumentStructure(structureSet);
+      // indicate that the document is not currently modified
+      document.setModified(false);
+      return document;
+    }
+    catch (SQLException sqlEx) {
+    }
+    return null;
+  }
+  /**
+   *
+   */
+  public boolean isModified()
+  { return this.isModified;
+  }
+  public void setModified(boolean isModified)
+  { this.isModified = isModified;
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentInterface.java

-              r5800
+              r5944
 import java.util.List;
 import java.util.Map;
+import java.net.URL;
 import org.greenstone.gsdl3.gs3build.metadata.*;
 …
 public interface DocumentInterface
+{
+    /**
+     *  Get the immediate document type of the document.  The type may be derived/inherited
+     *  from other DocumentInterface types, but that cannot be checked here.
+     *
+     *  @return <code>String</code> the document type as a string
+     */
+    public String getDocumentType();
+  /**
+   *  Get the immediate document type of the document.  The type may be derived/inherited
+   *  from other DocumentInterface types, but that cannot be checked here.
+   *
+   *  @return <code>String</code> the document type as a string
+   */
+  public String getDocumentType();
+  /**
+   *  Check if this document is of a particular type, or is derived from a particular
+   *  type - i.e. inheritance is considered as well as the immediate type.
+   *
+   *  @param  <code>String</code> the type to check against.
+   *  @return <code>boolean</code> if the document matches the given type.
+   */
+  public boolean isDocumentType(String documentType);
+  public void setID(DocumentID id);
+  public DocumentID getID();
+    /**
+     *  Check if this document is of a particular type, or is derived from a particular
+     *  type - i.e. inheritance is considered as well as the immediate type.
+     *
+     *  @param  <code>String</code> the type to check against.
+     *  @return <code>boolean</code> if the document matches the given type.
+     */
+    public boolean isDocumentType(String documentType);
+  /**
+   *  Get the METS type of the document.
+   *
+   *  @return <code>String</code> the document type as a string
+   */
+  public String getMETSType();
+  public void setHeader(METSHeader header);
+  public METSHeader getHeader();
+    public void setID(DocumentID id);
+    public DocumentID getID();
+  /**
+   *  Whether the document is indexed or not.
+   *
+   *  @return <code>boolean</code> <code>true</code> by default.
+   */
+  public boolean isIndexed();
+    /**
+     *  Get the METS type of the document.
+     *
+     *  @return <code>String</code> the document type as a string
+     */
+    public String getMETSType();
+  /**
+   *  The plain text of the document.
+   *
+   *  @return <code>String</code> This value may be <code>null</code>
+   *          for documents which have no textual component - e.g.
+   *          an image file
+   */
+  public String getDocumentText();
+  /**
+   *  The metadata for the document, encoded as a map.
+   *
+   *  The returned Map may have the following properties:
+   *  1) It may be <code>null</code> - e.g. for plain text documents
+   *  2) Any value in the map may be a <code>List</code> object containing
+   *     more than one possible value - e.g. the key 'Author' may be associated
+   *     with a List of several people.
+   *  3) Any value in the map may itself be a <code>Map</code> where the
+   *     encoding scheme permits groupings of hierarchical metadata items.
+   *  4) Where the namespace for a metadata item is known, its key will
+   *     include the namespace.  Hence Dublin Core 'Author' would be encoded
+   *     as "dc.Author" as the key.
+   *
+   *  @return <code>METSDescriptiveSet</code> the metadata of the document
+   */
+  public METSDescriptiveSet getDocumentMetadata();
+  /**
+   *  The metadata values for a given namespace and label on the document, encoded as a <code>List</code>
+   *
+   *  @param <code>String</code> the namespace of the metadata
+   *  @param <code>String</code> the label of the values to obtain
+   *
+   *  @return <code>List</code> the metadata values
+   */
+  public List getDocumentMetadataItem(String namespace, String label);
+  /**
+   *  The metadata for the document, encoded as a map.
+   *
+   *  @param <code>String</code> the namespace and label of the metadata, separated
+   *                             by a colon.  If no namespace is given, it is
+   *                             defaulted
+   *
+   *  @return <code>List</code> the metadata values
+   */
+  public List getDocumentMetadataItem(String namespaceLabel);
+  /**
+   *  Facilitate the decoration of a document with external or extracted
+   *  metadata.  This is a "cheap" form which doesn't have a separate
+   *  namespace element.  Either the data is to be stored in the "open"
+   *  Greenstone metadata namespace, or the namespace is encoded within
+   *  the label.
+   *
+   *  @param <code>String</code> label of the metadata, with a '.' to delimit
+   *                             sub-component structures.  The label may commence
+   *                             with a namespace followed by a colon.
+   *                             <p>e.g. "dc:title" for Dublin Core Title.</p>
+   *  @param <code>String</code> value of the metadata
+   */
+  public void addDocumentMetadata(String label, String value);
+  /**
+   *  Facilitate the decoration of a document with external or extracted
+   *  metadata.
+   *
+   *  @param <code>String</code> namespace of the metadata
+   *  @param <code>String</code> label of the metadata, with a '.' to delimit
+   *                             sub-component structures
+   *  @param <code>String</code> value of the metadata
+   */
+  public void addDocumentMetadata(String namespace, String label, String value);
+    public void setHeader(METSHeader header);
+    public METSHeader getHeader();
+  /**
+   *  Post metadata to a file in this document - the appropriate changes
+   *  should be made...
+   *
+   *  @param <code>URL</code> the location of the file...
+   *  @param <code>String</code> the namespace of the metadata
+   *  @param <code>String</code> label of the metadata, with a '.' to delimit
+   *                             sub-component structures
+   *  @param <code>String</code> value of the metadata
+   */
+  public void postFileMetadata(URL fileLocation, String namespace, String label, String value);
+    /**
+     *  Whether the document is indexed or not.
+     *
+     *  @return <code>boolean</code> <code>true</code> by default.
+     */
+    public boolean isIndexed();
+  /**
+   *  The constituent files for the document: each should be wrapped up
+   *  as a URL
+   *
+   *  @return <code>METSFileSet</code> the files which constitute the document
+   */
+  public METSFileSet getDocumentFiles();
+    /**
+     *  The plain text of the document.
+     *
+     *  @return <code>String</code> This value may be <code>null</code>
+     *          for documents which have no textual component - e.g.
+     *          an image file
+     */
+    public String getDocumentText();
+    /**
+     *  The metadata for the document, encoded as a map.
+     *
+     *  The returned Map may have the following properties:
+     *  1) It may be <code>null</code> - e.g. for plain text documents
+     *  2) Any value in the map may be a <code>List</code> object containing
+     *     more than one possible value - e.g. the key 'Author' may be associated
+     *     with a List of several people.
+     *  3) Any value in the map may itself be a <code>Map</code> where the
+     *     encoding scheme permits groupings of hierarchical metadata items.
+     *  4) Where the namespace for a metadata item is known, its key will
+     *     include the namespace.  Hence Dublin Core 'Author' would be encoded
+     *     as "dc.Author" as the key.
+     *
+     *  @return <code>METSDescriptiveSet</code> the metadata of the document
+     */
+    public METSDescriptiveSet getDocumentMetadata();
+  /**
+   *  Set the constituent files for the document.
+   */
+  public void setDocumentFiles(METSFileSet fileSet);
+    /**
+     *  The metadata for the document, encoded as a map.
+     *
+     *  @param <code>String</code> the namespace of the metadata
+     *  @param <code>String</code> the label of the values to obtain
+     *
+     *  @return <code>List</code> the metadata values
+     */
+    public List getDocumentMetadataItem(String namespace, String label);
+  /**
+   *  Obtain the structural information on the document
+   *
+   *  @return <code>METSStructureSet</code> the structural information on the
+   *          document.
+   */
+  public METSStructureSet getDocumentStructure();
+  /**
+   *  Indicate whether the document can be stored in a native form in
+   *  a METS wrapper
+   *
+   *  @return <code>boolean</code> <code>true</code> if the document
+   *          is just to be wrapped in a METS shell.
+   */
+  public boolean isMETSCompatible();
+  /**
+   *  Write the document into a METS wrapper - for many document types,
+   *  this will not actually be done by the document itself, but rather
+   *  by the default writer
+   */
+  public DocumentWriter getMETSWriter();
+  /**
+   *  Get the writer to send the document to an SQL database
+   */
+  public DocumentSQLWriter getSQLWriter();
+    /**
+     *  The metadata for the document, encoded as a map.
+     *
+     *  @param <code>String</code> the namespace and label of the metadata, separated
+     *                             by a colon.  If no namespace is given, it is
+     *                             defaulted
+     *
+     *  @return <code>List</code> the metadata values
+     */
+    public List getDocumentMetadataItem(String namespaceLabel);
+  /**
+   *  Check if the document is changed or not
+   */
+  public boolean isModified();
+    /**
+     *  Facilitate the decoration of a document with external or extracted
+     *  metadata.  This is a "cheap" form which doesn't have a separate
+     *  namespace element.  Either the data is to be stored in the "open"
+     *  Greenstone metadata namespace, or the namespace is encoded within
+     *  the label.
+     *
+     *  @param <code>String</code> label of the metadata, with a '.' to delimit
+     *                             sub-component structures.  The label may commence
+     *                             with a namespace followed by a colon.
+     *                             <p>e.g. "dc:title" for Dublin Core Title.</p>
+     *  @param <code>String</code> value of the metadata
+     */
+    public void addDocumentMetadata(String label, String value);
+    /**
+     *  Facilitate the decoration of a document with external or extracted
+     *  metadata.
+     *
+     *  @param <code>String</code> namespace of the metadata
+     *  @param <code>String</code> label of the metadata, with a '.' to delimit
+     *                             sub-component structures
+     *  @param <code>String</code> value of the metadata
+     */
+    public void addDocumentMetadata(String namespace, String label, String value);
+    /**
+     *  The constituent files for the document: each should be wrapped up
+     *  as a URL
+     *
+     *  @return <code>METSFileSet</code> the files which constitute the document
+     */
+    public METSFileSet getDocumentFiles();
+    /**
+     *  Obtain the structural information on the document
+     *
+     *  @return <code>METSStructureSet</code> the structural information on the
+     *          document.
+     */
+    public METSStructureSet getDocumentStructure();
+    /**
+     *  Indicate whether the document can be stored in a native form in
+     *  a METS wrapper
+     *
+     *  @return <code>boolean</code> <code>true</code> if the document
+     *          is just to be wrapped in a METS shell.
+     */
+    public boolean isMETSCompatible();
+    /**
+     *  Write the document into a METS wrapper - for many document types,
+     *  this will not actually be done by the document itself, but rather
+     *  by the default writer
+     */
+    public DocumentWriter getMETSWriter();
+    public DocumentSQLWriter getSQLWriter();
+  /**
+   *  Set the document modified state
+   */
+  public void setModified(boolean isModified);
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentList.java

-              r5800
+              r5944
 package org.greenstone.gsdl3.gs3build.doctypes;
+import java.util.Iterator;
 import java.util.List;
 import java.util.ArrayList;
 …
 import java.io.IOException;
+import java.net.URL;
+import java.sql.SQLException;
+import java.sql.ResultSet;
 import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection;
+import org.greenstone.gsdl3.gs3build.database.GS3SQLSelect;
+import org.greenstone.gsdl3.gs3build.database.GS3SQLWhereItem;
+import org.greenstone.gsdl3.gs3build.database.GS3SQLWhere;
+import org.greenstone.gsdl3.gs3build.database.GS3SQLField;
 public class DocumentList
+{
+    DocumentInterface []       list;
+    int                        size;
+    int                        used;
+    DocumentIDFactoryInterface idFactory;
+  DocumentInterface []       list;       // what is currently cached
+  int                        size;       // the maximum number in the cache
+  int                        used;       // the actual number in the cache
+  int                        count;      // the total number of known documents
+  DocumentIDFactoryInterface idFactory;  // A manufacturer of novel document IDs
+  GS3SQLConnection           connection; // used to query the SQL database
+  private static final int maxSize = 10;
+  public DocumentList(GS3SQLConnection connection)
+  { this.idFactory = null;
+    this.list  = new DocumentInterface[10];
+    this.used  = 0;
+    this.size  = 10;
+    this.count = 0;
+    this.connection = connection;
+  }
+  public DocumentList(DocumentIDFactoryInterface idFactory, GS3SQLConnection connection)
+  { this.idFactory = idFactory;
+    this.list  = new DocumentInterface[10];
+    this.used  = 0;
+    this.size  = 10;
+    this.count = 0;
+    this.connection = connection;
+  }
+  /**
+   *  Write the document into the document list (cache) and the database.
+   *
+   *  @param <code>DocumentInterface</code> the document itself
+   */
+  public void addDocument(DocumentInterface document)
+  { // increase cache size, etc. as necessary
+    if (this.used == this.size) {
+      if (this.size >= maxSize) {
+    for (int i = 0; i < this.size - 1; i ++) {
+      this.list[i] = this.list[i+1];
+    }
+    this.used --;
+      }
+      else {
+    this.ensureSize((this.size * 2) > maxSize ? maxSize : (this.size * 2));
+      }
+    }
+    // insert the document itself
+    this.list[this.used] = document;
+    // set the document identifier, if not already set
+    if (document.getID() == null) {
+      DocumentID id = this.idFactory.getNewDocumentID(document);
+      document.setID(id);
+    }
+    // add to the database as well
+    document.getSQLWriter().writeDocument(document, this.connection);
+    // Remember that we've used one more item from the cache.
+    this.used ++;
+    // Note additional document
+    this.count ++;
+  }
+  /**
+   *  Note that an individual document is modified, and act accordingly
+   *
+   *  @param <code>DocumentInterface</code> the document
+   */
+  public void modifiedDocument(DocumentInterface document)
+  { document.getSQLWriter().writeDocument(document, this.connection);
+  }
+  /**
+   *  Get an iterator across all the documents, not merely those in
+   *  the cache.  Note that this <code>Iterator</code> does <b>not</b>
+   *  support the <code>remove()</code> function, and will raise an
+   *  <code>UnsupportedOperationException</code> if you attempt to do
+   *  so.
+   *
+   *  @return <code>Iterator</code> the iterator across the documents.
+   */
+  public Iterator iterator()
+  { return new DocumentListIterator(connection);
+  }
+  /**
+   *  Get the nth member of the <b>cached</b> document list.
+   *
+   *  @deprecated
+   */
+  public DocumentInterface getDocument(int index)
+  { if (index < 0 || index >= this.used)
+    { return null;
+    }
+    return this.list[index];
+  }
+  /**
+  public DocumentID getDocumentID(int index)
+  { if (index < 0 || index >= this.used)
+    { return null;
+    }
+    return this.list[index].getID();
+  }
+  */
+  protected void ensureSize(int size)
+  { DocumentInterface [] newList = new DocumentInterface[size];
+    System.arraycopy(this.list, 0, newList, 0, this.size);
+    this.list = newList;
+    this.size = size;
+  }
+  public void writeDocuments(File directory)
+  { Iterator documents = this.iterator();
+    int item = 0;
+    while (documents.hasNext())
+    { DocumentInterface document = (DocumentInterface) documents.next();
+      try
+      { item ++;
+    File localFile = new File(directory, "Doc"+Integer.toString(item)+".xml");
+    FileWriter fileWriter = new FileWriter(localFile);
+    PrintWriter writer = new PrintWriter(fileWriter);
+    document.getMETSWriter().writeDocument(document, writer);
+    writer.close();
+    fileWriter.close();
+      }
+      catch (IOException io)
+      {
+      }
+    }
+  }
+  public void writeSQLDocuments(GS3SQLConnection connection)
+  { for (int i = 0; i < this.used; i ++)
+    { this.list[i].getSQLWriter().writeDocument(this.list[i], connection);
+    }
+  }
+  public static DocumentList readSQLDocuments(GS3SQLConnection connection)
+  { DocumentList list = new DocumentList(connection);
+    GS3SQLSelect select = new GS3SQLSelect("document");
+    select.addField("*");
+    ResultSet documents;
+    try {
+      connection.execute(select.toString());
+      documents = connection.getResultSet();
+      if (documents.first())
+      { do
+    { DocumentInterface document = AbstractDocument.readSQL(connection, documents);
+          list.addDocument(document);
+    }
+    while (documents.next());
+      }
+    }
+    catch (java.sql.SQLException ex)
+    { System.out.println(ex);
+      return null;
+    }
+    return list;
+  }
+  public int getCount()
+  { return this.count;
+  }
+  public int size()
+  { return this.used;
+  }
+}
+class DocumentListIterator implements Iterator
+{
+  private boolean hasNext;
+  private ResultSet resultSet;
+  private GS3SQLConnection connection;
+  public DocumentListIterator(GS3SQLConnection connection)
+  {
+    this.connection = connection;
+    GS3SQLSelect select = new GS3SQLSelect("document");
+    select.addField("*");
+    try {
+      connection.execute(select.toString());
+      this.resultSet = connection.getResultSet();
+      this.hasNext = this.resultSet.first();
+    } catch (SQLException ex) {
+      this.hasNext = false;
+    }
+  }
+  public boolean hasNext()
+  { return this.hasNext;
+  }
+  public Object next()
+  {
+    // get the 'next' document first
+    DocumentInterface document = AbstractDocument.readSQL(connection, this.resultSet);
+    // now actually step forward to the next item, so that we know if we have one!
+    try {
+      this.hasNext = this.resultSet.next();
+      if (!this.hasNext) {
+    this.resultSet.close(); // be a good citizen & close used result sets
+      }
+    } catch (SQLException ex) {
+      this.hasNext = false;
+    }
+    return document;
+  }
+  public void remove() throws UnsupportedOperationException
+  { throw new UnsupportedOperationException("DocumentList does not support iterator removal of documents");
+  }
+  public List getDocumentIdsWithFile(URL fileLocation)
+  { List reply = new ArrayList();
+    GS3SQLSelect select = new GS3SQLSelect("files");
+    select.addField("*");
+    GS3SQLWhere  where  = new GS3SQLWhere(new GS3SQLWhereItem("FileLocation", "=", fileLocation.toString()));
+    select.setWhere(where);
+    this.connection.execute(select.toString());
+    ResultSet results = this.connection.getResultSet();
+    if (results != null) {
+      select = new GS3SQLSelect("filegroups");
+      select.addField("DocID");
+      select.setDistinct(true);
+      where = new GS3SQLWhere();
+      where.setCondition(GS3SQLWhere.OR_CONDITION);
+      GS3SQLWhereItem whereItem = null;
+      try {
+    results.first();
+    do {
+      int fileGroupRef = results.getInt("FileGroupRef");
+      whereItem = new GS3SQLWhereItem("FileGroupRef", "=", Integer.toString(fileGroupRef), GS3SQLField.INTEGER_TYPE);
+      where.add(whereItem);
+    }
+    while (results.next());
+    select.setWhere(where);
+    results.close();
+    public DocumentList(DocumentIDFactoryInterface idFactory)
+    {   this.idFactory = idFactory;
+        this.list = new DocumentInterface[10];
+        this.used = 0;
+        this.size = 10;
+    }
+    public void addDocument(DocumentInterface document)
+    {   if (this.used == this.size) {
+            this.ensureSize(this.size * 2);
+        }
+        this.list[this.used] = document;
+        DocumentID id = this.idFactory.getNewDocumentID(document);
+        document.setID(id);
+        this.used ++;
+    }
+    public DocumentInterface getDocument(int index)
+    {   if (index < 0 || index >= this.used)
+        {   return null;
+        }
+        return this.list[index];
+    }
+    public DocumentID getDocumentID(int index)
+    {   if (index < 0 || index >= this.used)
+        {   return null;
+        }
+        return this.list[index].getID();
+    }
+    public void ensureSize(int size)
+    {   DocumentInterface [] newList = new DocumentInterface[size];
+        System.arraycopy(this.list, 0, newList, 0, this.size);
+        this.list = newList;
+        this.size = size;
+    }
+    public void writeDocuments(File directory)
+    {   for (int i = 0; i < this.used; i ++)
+        {   try
+            {
+                File localFile = new File(directory, "Doc"+Integer.toString(i)+".xml");
+                FileWriter fileWriter = new FileWriter(localFile);
+                PrintWriter writer = new PrintWriter(fileWriter);
+                this.list[i].getMETSWriter().writeDocument(this.list[i], writer);
+                writer.close();
+                fileWriter.close();
+            }
+            catch (IOException io)
+            {
+            }
+        }
+    }
+    public void writeSQLDocuments(GS3SQLConnection connection)
+    {   for (int i = 0; i < this.used; i ++)
+        {   this.list[i].getSQLWriter().writeDocument(this.list[i], connection);
+        }
+    }
+    public int size()
+    {   return this.used;
+    }
+    this.connection.execute(select.toString());
+    results = this.connection.getResultSet();
+    results.first();
+    do {
+      String docId = results.getString("DocID");
+      reply.add(docId);
+    } while (results.next());
+      }
+      catch (SQLException sqlEx)
+      { System.err.println(sqlEx);
+      }
+    }
+    return reply;
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentLoader.java

-              r5800
+              r5944
 public class DocumentLoader
+{
     public static String getAsString(InputStream in)
     {   StringBuffer reply;
         byte    data[] = new byte[128];
         int     databytes;
+  public static String getAsString(InputStream in)
+  { StringBuffer reply;
+    byte    data[] = new byte[1024];
+    int     databytes;
+        reply   = new StringBuffer();
+    reply   = new StringBuffer();
+    try
+    {
+      do
+      { databytes = in.read(data);
+        if (databytes > 0)
+    { reply.append(new String(data, 0, databytes));
+    }
+      } while (databytes >= 0);
+    }
+    catch (IOException io)
+    {
+    }
+    return reply.toString();
+  }
+        try
+        {
+            do
+            { databytes = in.read(data);
+                if (databytes > 0)
+                {   reply.append(new String(data, 0, databytes));
+                }
+            } while (databytes >= 0);
+        }
+        catch (IOException io)
+        {
+        }
+  public static String getAsString(File file)
+  { FileInputStream in;
+    String          reply = null;
+        return reply.toString();
+    try
+    { in    = new FileInputStream(file);
+      if (in == null)
+    {   return null;
+    }
+      reply = getAsString(in);
+      in.close();
+    }
+    catch (IOException io)
+    { return null;
+    }
+    return reply;
+  }
+    public static String getAsString(File file)
+    { FileInputStream in;
+        String                  reply = null;
+        try
+        {   in  = new FileInputStream(file);
+            if (in == null)
+            {   return null;
+            }
+            reply = getAsString(in);
+            in.close();
+        }
+        catch (IOException io)
+        {   return null;
+        }
+        return reply;
+    }
+    public static String getAsString(URL url)
+    {   if (url.toString().startsWith("file://"))
+        {   File file = new File(url.toString().substring(7));
+            return getAsString(file);
+        }
+        return null;
+    }
+  public static String getAsString(URL url)
+  { if (url.toString().startsWith("file://"))
+    { File file = new File(url.toString().substring(7));
+      return getAsString(file);
+    }
+    else if (url.toString().startsWith("file:/"))
+    { File file = new File(url.toString().substring(5));
+      return getAsString(file);
+    }
+    return null;
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentSQLWriter.java

-              r5800
+              r5944
     // put the document into the database
     try {
     if (document.getID() != null)
+      if (document.getID() != null)
       { //tag = XMLTools.addAttribute(tag, "OBJID", document.getID().toString());
+      GS3SQLInsert insert = new GS3SQLInsert("document");
+    GS3SQLSelect select = new GS3SQLSelect("document");
+    select.addField("*");
+    select.setWhere(new GS3SQLWhere(new GS3SQLWhereItem("DocID", "=", document.getID().toString())));
+    connection.execute(select.toString());
+    ResultSet results = connection.getResultSet();
+    if (results == null ||
+        !results.first())
+    { GS3SQLInsert insert = new GS3SQLInsert("document");
       insert.addValue("DocID", document.getID().toString());
+      insert.addValue("DocType", document.getDocumentType());
-      System.out.println(insert.toString());
       connection.execute(insert.toString());
+    }
+      }
     } catch (Exception ex) {

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/GMLRecogniser.java

-              r5800
+              r5944
 public class GMLRecogniser implements RecogniserInterface
+{
     DocumentList listRepository;
+  DocumentList listRepository;
     public GMLRecogniser(DocumentList listRepository)
     {   this.listRepository = listRepository;
+    }
+  public GMLRecogniser(DocumentList listRepository)
+  { this.listRepository = listRepository;
+  }
     public boolean parseDocument(METSFile file)
+    {
         String MIMEType = file.getMIMEType();
         if (MIMEType == null ||
               MIMEType.equals("text/xml")) {
             URL location = file.getLocation();
             return this.parseDocument(location);
+        }
         return false;
+    }
+  public boolean parseDocument(METSFile file)
+  {
+    String MIMEType = file.getMIMEType();
+    if (MIMEType == null ||
+    MIMEType.equals("text/xml")) {
+      URL location = file.getLocation();
+      return this.parseDocument(location);
+    }
+    return false;
+  }
     public boolean parseDocument(URL url)
     {   if (url.toString().startsWith("file://")) {
             String fileName = url.toString().substring(7);
             if (fileName.endsWith(".gml"))
+          {
                 System.out.println("Posting GML Document " + fileName);
             GMLDocument doc = new GMLDocument(url);
             this.listRepository.addDocument(doc);
               // TODO: spawn knowledge of children too...
 //              System.out.println(doc.getDocumentText());
               return true;
+            }
+        }
         else {
             // TODO: get Mime type remotely, and then proceed if required
+        }
         return false;
+    }
+  public boolean parseDocument(URL url)
+  { if (url.toString().startsWith("file://")) {
+      String fileName = url.toString().substring(7);
+      if (fileName.endsWith(".gml"))
+      {
+    System.out.println("Posting GML Document " + fileName);
+    GMLDocument doc = new GMLDocument(url);
+    this.listRepository.addDocument(doc);
+    // TODO: spawn knowledge of children too...
+    //              System.out.println(doc.getDocumentText());
+    return true;
+      }
+    }
+    else {
+      // TODO: get Mime type remotely, and then proceed if required
+    }
+    return false;
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java

-              r5800
+              r5944
 public class HTMLDocument extends AbstractDocument
+{
     public static final String HTML_DOCUMENT_TYPE = "HTML";
+  public static final String HTML_DOCUMENT_TYPE = "HTML";
+    /**
+     *  Create the HTMLDocument from a given URL - the URL may in fact be a reference
+     *  to a local file.
+     *
+     *  @param <code>URL</code> The location from which to load the file
+     */
+    public HTMLDocument(URL url)
+    {   super(url);
+  public HTMLDocument(DocumentID id)
+  { super(id);
+  }
+        HTMLDoc htmlDoc;
+        if (url.toString().startsWith("file://"))
+        {   htmlDoc = new HTMLDoc(url, url.toString().substring(7));
+        }
+        else
+        {   htmlDoc = new HTMLDoc(url);
+        }
+  /**
+   *  Create the HTMLDocument from a given URL - the URL may in fact be a reference
+   *  to a local file.
+   *
+   *  @param <code>URL</code> The location from which to load the file
+   */
+  public HTMLDocument(URL url)
+  { super(url);
+        this._extractDocumentFiles(htmlDoc);
+        this._extractDocumentMetadata(htmlDoc);
+    HTMLDoc htmlDoc;
+    if (url.toString().startsWith("file://"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
+    }
+    else if (url.toString().startsWith("file:/"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
+    }
+    else
+    { htmlDoc = new HTMLDoc(url);
+    }
+    this._extractDocumentFiles(htmlDoc);
+    this._extractDocumentMetadata(htmlDoc);
+  }
+  private void _extractDocumentMetadata(HTMLDoc htmlDoc)
+  { HTMLBlock codedContent = htmlDoc.getCodedContent();
+    boolean inTitle = false;
+    StringBuffer title = new StringBuffer();
+    for (int e = 0; e < codedContent.size(); e ++)
+    { if (codedContent.elementAt(e) instanceof HTMLTag)
+      { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+        if (tag.tagName().equals("meta"))
+    { // check that the name of the metadata item exists
+      String name = tag.idValue("name");
+      if (name == null || name.length() == 0) {
+        continue;
+      }
+      // get the value, if it exists
+      String value = tag.idValue("content");
+      if (value != null && value.length() > 0) {
+        System.out.println("  " + value);
+      }
+      // if value does not exist, default it to being the same
+      // as the name.
+      else {
+        value = name;
+      }
+      this.addDocumentMetadata(name, value);
+    }
+    else if (tag.tagName().equals("title"))
+    { inTitle = true;
+    }
+    else if (tag.tagName().equals("/title"))
+    { inTitle = false;
+    }
+    // cut off when real body content appears - not a perfect
+        // implementation, just cheap & cheerful
+    else if (tag.tagName().equals("/head"))
+    { break;
+    }
+    else if (tag.tagName().equals("body"))
+    { break;
+    }
+      }
+      else if (inTitle == true)
+      { title.append(codedContent.elementAt(e).toString());
+      }
+    }
+    if (title.length() > 0)
+    { this.addDocumentMetadata("title", title.toString());
+    }
+  }
+    private void _extractDocumentMetadata(HTMLDoc htmlDoc)
+    {   HTMLBlock codedContent = htmlDoc.getCodedContent();
+        boolean inTitle = false;
+        StringBuffer title = new StringBuffer();
+  private void _extractDocumentFiles(HTMLDoc htmlDoc)
+  { URL homeUrl = this.fileSet.getFile(0).getLocation();
+        for (int e = 0; e < codedContent.size(); e ++)
+        {   if (codedContent.elementAt(e) instanceof HTMLTag)
+            {   HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+    HTMLBlock codedContent = htmlDoc.getCodedContent();
+    for (int e = 0; e < codedContent.size(); e ++)
+    { if (codedContent.elementAt(e) instanceof HTMLTag)
+      { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+                if (tag.tagName().equals("meta"))
+                {   // check that the name of the metadata item exists
+                    String name = tag.idValue("name");
+                    if (name == null || name.length() == 0) {
+                        continue;
+                    }
+        if (tag.tagName().equals("img"))
+    { String location = tag.idValue("src");
+                    // get the value, if it exists
+                    String value = tag.idValue("content");
+                    if (value != null && value.length() > 0) {
+                        System.out.println("  " + value);
+                    }
+                    // if value does not exist, default it to being the same
+                    // as the name.
+                    else {
+                        value = name;
+                    }
+                    this.addDocumentMetadata(name, value);
+                }
+                else if (tag.tagName().equals("title"))
+                { inTitle = true;
+                }
+                else if (tag.tagName().equals("/title"))
+                {   inTitle = false;
+                }
+                // cut off when real body content appears - not a perfect
+                // implementation, just cheap & cheerful
+                else if (tag.tagName().equals("/head"))
+                {   break;
+                }
+                else if (tag.tagName().equals("body"))
+                {   break;
+                }
+            }
+            else if (inTitle == true)
+            { title.append(codedContent.elementAt(e).toString());
+            }
+        }
+        if (title.length() > 0)
+        { this.addDocumentMetadata("title", title.toString());
+        }
+    }
+    private void _extractDocumentFiles(HTMLDoc htmlDoc)
+    { URL homeUrl = this.fileSet.getFile(0).getLocation();
+      HTMLBlock codedContent = htmlDoc.getCodedContent();
+      for (int e = 0; e < codedContent.size(); e ++)
+      { if (codedContent.elementAt(e) instanceof HTMLTag)
+    { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+      if (tag.tagName().equals("img"))
+      { String location = tag.idValue("src");
+        try
+        { // make the url for the image, and then add it to the document list of
+          //
+          URL imgUrl = new URL(homeUrl, location);
+          METSFile file = this.fileSet.addFile(imgUrl);
+          this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
+        }
+        catch (MalformedURLException ex)
+        { // TODO: report exception/failure to resolve...
+        }
+      try
+      { // make the url for the image, and then add it to the document list of
+        //
+        URL imgUrl = new URL(homeUrl, location);
+        METSFile file = this.fileSet.addFile(imgUrl);
+        this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
+      }
+      catch (MalformedURLException ex)
+      { // TODO: report exception/failure to resolve...
+      }
+    }
+      }
+    }
+  }
+    public String getDocumentType()
+    {   return HTML_DOCUMENT_TYPE;
+  public String getDocumentType()
+  { return HTML_DOCUMENT_TYPE;
+  }
+  public String getDocumentText()
+  {
+    HTMLDoc htmlDoc;
+    URL     url =(URL) this.fileSet.getFile(0).getLocation();
+    if (url.toString().startsWith("file://"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
+    }
+    public String getDocumentText()
+    { HTMLDoc htmlDoc = new HTMLDoc((URL) this.fileSet.getFile(0).getLocation(), this.fileSet.getFile(0).toString().substring(7));
+      return htmlDoc.getContent();
+    else if (url.toString().startsWith("file:/"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
+    }
+    else
+    { htmlDoc = new HTMLDoc(url);
+    }
+    return htmlDoc.getContent();
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLRecogniser.java

-              r5800
+              r5944
 public class HTMLRecogniser implements RecogniserInterface
+{
     DocumentList listRepository;
+  DocumentList listRepository;
+    public HTMLRecogniser(DocumentList listRepository)
+    {   this.listRepository = listRepository;
+    }
+  public HTMLRecogniser(DocumentList listRepository)
+  { this.listRepository = listRepository;
+  }
+  public boolean parseDocument(METSFile file)
+  {
+    String MIMEType = file.getMIMEType();
+    if (MIMEType == null ||
+    MIMEType.equals("text/html")) {
+      URL location = file.getLocation();
+      return this.parseDocument(location);
+    }
+    return false;
+  }
+    public boolean parseDocument(METSFile file)
+    {
+        String MIMEType = file.getMIMEType();
+        if (MIMEType == null ||
+              MIMEType.equals("text/html")) {
+            URL location = file.getLocation();
+            return this.parseDocument(location);
+        }
+        return false;
+    }
+  public boolean parseDocument(URL url)
+  { String fileName = null;
+    public boolean parseDocument(URL url)
+    {   if (url.toString().startsWith("file://")) {
+            String fileName = url.toString().substring(7);
+            if (fileName.endsWith(".htm") ||
+                  fileName.endsWith(".html"))
+          { System.out.println("Posting HTML Document " + fileName);
+    if (url.toString().startsWith("file://")) {
+      fileName = url.toString().substring(7);
+    }
+    else if (url.toString().startsWith("file:/")) {
+      fileName = url.toString().substring(5);
+    }
+            HTMLDocument doc = new HTMLDocument(url);
+            this.listRepository.addDocument(doc);
+              return true;
+            }
+        }
+        else {
+            // Get Mime type remotely, and then proceed if required
+            String mimeType = HTTPTools.getMIMEType(url);
+    if (fileName != null) {
+      if (fileName.endsWith(".htm") ||
+      fileName.endsWith(".html"))
+      { System.out.println("Posting HTML Document " + fileName);
+            if (mimeType == "text/html")
+            {   System.out.println("Posting HTML Document " + url.toString());
+        HTMLDocument doc = new HTMLDocument(url);
+    this.listRepository.addDocument(doc);
+    return true;
+      }
+    }
+    else {
+      // Get Mime type remotely, and then proceed if required
+      String mimeType = HTTPTools.getMIMEType(url);
+      if (mimeType == "text/html")
+      { System.out.println("Posting HTML Document " + url.toString());
                 HTMLDocument doc = new HTMLDocument(url);
                 this.listRepository.addDocument(doc);
                 return true;
+            }
+        }
         return false;
+    }
+        HTMLDocument doc = new HTMLDocument(url);
+    this.listRepository.addDocument(doc);
+    return true;
+      }
+    }
+    return false;
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/RecogniserManager.java

-              r5800
+              r5944
 public class RecogniserManager implements FileCrawlObserver
+{
     RecogniserInterface list[];
     int                 used;
     int                 size;
+  RecogniserInterface list[];
+  int                 used;
+  int                 size;
     public RecogniserManager()
     {   this.list = new RecogniserInterface[10];
         this.used = 0;
         this.size = 10;
+    }
+  public RecogniserManager()
+  { this.list = new RecogniserInterface[10];
+    this.used = 0;
+    this.size = 10;
+  }
     public void addRecogniser(RecogniserInterface recogniser)
     {   this.ensureCapacity(this.used + 1);
+  public void addRecogniser(RecogniserInterface recogniser)
+  { this.ensureCapacity(this.used + 1);
         this.list[this.used] = recogniser;
         this.used ++;
+    }
+    this.list[this.used] = recogniser;
+    this.used ++;
+  }
     public void processFile(URL url)
     {   boolean result;
+  public void processFile(URL url)
+  { boolean result;
         for (int r = 0; r < this.used; r ++)
         {   if (list[r].parseDocument(url)) {
                 break;
+            }
+        }
+    }
+    for (int r = 0; r < this.used; r ++)
+    { if (list[r].parseDocument(url)) {
+        break;
+      }
+    }
+  }
+    public void processFile(File file)
+    {   try {
+            URL url = new URL("file://"+file.toString());
+  public void processFile(File file)
+  { try {
+      URL url = new URL("file://"+file.toString());
+      this.processFile(url);
+    }
+    catch (java.net.MalformedURLException ex)
+    {
+      System.out.println(ex);
+    }
+  }
+            this.processFile(url);
+        }
+        catch (java.net.MalformedURLException ex)
+        {
+            System.out.println(ex);
+        }
+    }
+    private void ensureCapacity(int size)
+    {   while (size >= this.size)
+        {   RecogniserInterface newList [] = new RecogniserInterface[this.size*2];
+            this.size *= 2;
+            System.arraycopy(this.list, 0, newList, 0, this.size);
+            this.list = newList;
+        }
+    }
+  private void ensureCapacity(int size)
+  { while (size >= this.size)
+    { RecogniserInterface newList [] = new RecogniserInterface[this.size*2];
+      this.size *= 2;
+      System.arraycopy(this.list, 0, newList, 0, this.size);
+      this.list = newList;
+    }
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/TextDocument.java

-              r5800
+              r5944
 public class TextDocument extends AbstractDocument
+{
     public static final String TEXT_DOCUMENT_TYPE = "Text";
+  public static final String TEXT_DOCUMENT_TYPE = "Text";
     public TextDocument(URL url)
     {   super(url);
+    }
+  public TextDocument(DocumentID id)
+  { super(id);
+  }
     public String getDocumentType()
     {   return TEXT_DOCUMENT_TYPE;
+    }
+  public TextDocument(URL url)
+  { super(url);
+  }
+    /**
+     *  A pretty minimal and lazy document text extraction process.
+     */
+    public String getDocumentText()
+    { return DocumentLoader.getAsString((URL) this.fileSet.getFile(0).getLocation());
+    }
+  public String getDocumentType()
+  { return TEXT_DOCUMENT_TYPE;
+  }
+  /**
+   *  A pretty minimal and lazy document text extraction process.
+   */
+  public String getDocumentText()
+  { return DocumentLoader.getAsString((URL) this.fileSet.getFile(0).getLocation());
+  }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/TextRecogniser.java

-              r5800
+              r5944
 import org.greenstone.gsdl3.gs3build.metadata.*;
+import org.greenstone.gsdl3.gs3build.util.HTTPTools;
 public class TextRecogniser implements RecogniserInterface
+{
+    DocumentList listRepository;
+  DocumentList listRepository;
+  public TextRecogniser(DocumentList listRepository)
+  { this.listRepository = listRepository;
+  }
+    public TextRecogniser(DocumentList listRepository)
+    {   this.listRepository = listRepository;
+    }
+  public boolean parseDocument(METSFile file)
+  {
+    String MIMEType = file.getMIMEType();
+    if (MIMEType == null ||
+    MIMEType.equals("text/plain")) {
+      URL location = file.getLocation();
+      return this.parseDocument(location);
+    }
+    return false;
+  }
+    public boolean parseDocument(METSFile file)
+    {
+        String MIMEType = file.getMIMEType();
+        if (MIMEType == null ||
+              MIMEType.equals("text/plain")) {
+            URL location = file.getLocation();
+            return this.parseDocument(location);
+        }
+        return false;
+    }
+  public boolean parseDocument(URL url)
+  { String fileName = null;
+    public boolean parseDocument(URL url)
+    {   if (url.toString().startsWith("file://")) {
+            String fileName = url.toString().substring(7);
+    if (url.toString().startsWith("file://")) {
+      fileName = url.toString().substring(7);
+    }
+    else if (url.toString().startsWith("file:/")) {
+      fileName = url.toString().substring(5);
+    }
+            if (fileName.endsWith(".txt") ||
+              fileName.endsWith(".text"))
+            {   this.listRepository.addDocument(new TextDocument(url));
+                // TODO: spawn knowledge of children too...
+                System.out.println(">>> Posting text document " + fileName);
+                return true;
+            }
+        }
+        else
+        {   // Check MIME type
+        }
+    if (fileName != null) {
+      if (fileName.endsWith(".txt") ||
+      fileName.endsWith(".text"))
+      { this.listRepository.addDocument(new TextDocument(url));
+    // TODO: spawn knowledge of children too...
+        System.out.println(">>> Posting text document " + fileName);
+    return true;
+      }
+    }
+    else
+    { // Check MIME type
+      String mimeType = HTTPTools.getMIMEType(url);
+      if (mimeType == "text/plain")
+      { System.out.println("Posting Text document " + url.toString());
+        TextDocument doc = new TextDocument(url);
+    this.listRepository.addDocument(doc);
+    return true;
+      }
+    }
         return false;
+    }
+    return false;
+  }
+}

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: