Changeset 5944
- Timestamp:
- 2003-11-24T14:26:35+13:00 (20 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes
- Files:
-
- 3 added
- 11 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java
r5800 r5944 3 3 import java.util.List; 4 4 import java.util.ArrayList; 5 import java.util.Iterator; 5 6 import java.util.HashMap; 6 7 import java.util.Map; 8 9 import java.sql.SQLException; 10 import java.sql.ResultSet; 11 7 12 import java.net.URL; 8 13 … … 19 24 20 25 import org.greenstone.gsdl3.gs3build.util.MultiMap; 26 import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection; 21 27 22 28 /** … … 32 38 METSHeader header; 33 39 DocumentID id; 40 boolean isModified; 41 42 /** 43 * <p>Create a very vanilla document with a given document identifier.</p> 44 * <p>Most commonly used in dealing with loading files using DocumentFactory 45 * or similar.</p> 46 * 47 * @param <code>DocumentID</code> the document identifier 48 */ 49 public AbstractDocument(DocumentID id) 50 { this.fileSet = new METSFileSet(); 51 this.metadata = new METSDescriptiveSet(); 52 this.header = new METSHeader(); 53 this.structureSet = new METSStructureSet(); 54 this.id = id; 55 } 34 56 57 /** 58 * Create a basic document from a given <code>URL</code. This is usually the form 59 * called through the recognisers. 60 * 61 * @param <code>URL</code> the URL of the first file in the document package 62 */ 35 63 public AbstractDocument(URL url) 36 64 { this.fileSet = new METSFileSet(); … … 39 67 this.header = new METSHeader(); 40 68 this.structureSet = new METSStructureSet(); 69 this.id = null; 41 70 42 71 METSStructure structure = new METSStructure("All", "All", "Whole Document"); … … 131 160 namespace = GSDL3Namespace.GSDL3_NAMESPACE_ID; 132 161 } 162 163 // no need to set isModified, as the following call will do it anyway! 133 164 this.addDocumentMetadata(namespace, name, value); 134 165 } … … 139 170 public void addDocumentMetadata(String namespace, String label, String value) 140 171 { this.metadata.addMetadata("default", namespace, label, value); 141 } 142 143 /** 144 * @see DocumentInterace:setDocumentMetadata 172 this.isModified = true; 173 } 174 175 /** 176 * Post metadata to a file in this document - the appropriate changes 177 * should be made... 178 */ 179 public void postFileMetadata(URL fileLocation, String namespace, String label, String value) 180 { 181 // First get the list of file groups, etc. that this file is associated with... 182 List fileGroups = this.fileSet.findGroups(fileLocation); 183 184 // Next, get the METS divisions associated with each file group... 185 List divisions = this.structureSet.findDivisionsForFiles(fileGroups); 186 187 // Finally, post the metadata to the metadata group associated with each structure 188 Iterator divisionIter = divisions.iterator(); 189 while (divisionIter.hasNext()) 190 { METSDivision division = (METSDivision) divisionIter.next(); 191 192 // get the open namespace for this division 193 METSNamespace namespaceMetadata = division.findNamespace(namespace, true, this.metadata); 194 195 // then post the metadata to it... 196 namespaceMetadata.addMetadata(label, value); 197 } 198 } 199 200 /** 201 * @see DocumentInterface:setDocumentMetadata 145 202 */ 146 203 public void setDocumentMetadata(String namespace, String label, String value) 147 204 { this.metadata.setMetadata("default", namespace, label, value); 205 this.isModified = true; 148 206 } 149 207 … … 158 216 159 217 /** 218 * Set the metadata structure for this document 219 * 220 * @param <code>METSDescriptive</code> the new metadata holder for the document. 221 */ 222 public void setDocumentMetadata(METSDescriptiveSet metadata) 223 { this.metadata = metadata; 224 this.isModified = true; 225 } 226 227 /** 160 228 * Get the metadata structure of the document 161 229 * … … 164 232 public METSStructureSet getDocumentStructure() 165 233 { return this.structureSet; 234 } 235 236 public void setDocumentStructure(METSStructureSet structureSet) 237 { this.structureSet = structureSet; 166 238 } 167 239 … … 207 279 { return this.fileSet; 208 280 } 281 282 public void setDocumentFiles(METSFileSet fileSet) 283 { this.fileSet = fileSet; 284 } 209 285 210 286 /** … … 231 307 { return new DocumentSQLWriter(); 232 308 } 233 309 310 /** 311 * Obtain a document from the SQL database 312 */ 313 public static AbstractDocument readSQL(GS3SQLConnection connection, ResultSet sqlResult) 314 { try { 315 DocumentID id = new DocumentID(sqlResult.getString("DocID")); 316 String type = sqlResult.getString("docType"); 317 318 // Use a factory method to create the correct subtype... 319 AbstractDocument document = DocumentFactory.createDocument(type, id); 320 321 // Get the individual components of the document 322 METSFileSet fileSet = METSFileSet.readSQL(document, connection); 323 document.setDocumentFiles(fileSet); 324 METSDescriptiveSet descriptiveSet = METSDescriptiveSet.readSQL(document, connection); 325 document.setDocumentMetadata(descriptiveSet); 326 METSStructureSet structureSet = METSStructureSet.readSQL(document, connection); 327 document.setDocumentStructure(structureSet); 328 329 // indicate that the document is not currently modified 330 document.setModified(false); 331 return document; 332 } 333 catch (SQLException sqlEx) { 334 } 335 return null; 336 } 337 338 /** 339 * 340 */ 341 public boolean isModified() 342 { return this.isModified; 343 } 344 345 public void setModified(boolean isModified) 346 { this.isModified = isModified; 347 } 234 348 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentInterface.java
r5800 r5944 3 3 import java.util.List; 4 4 import java.util.Map; 5 6 import java.net.URL; 5 7 6 8 import org.greenstone.gsdl3.gs3build.metadata.*; … … 10 12 public interface DocumentInterface 11 13 { 12 /** 13 * Get the immediate document type of the document. The type may be derived/inherited 14 * from other DocumentInterface types, but that cannot be checked here. 15 * 16 * @return <code>String</code> the document type as a string 17 */ 18 public String getDocumentType(); 14 /** 15 * Get the immediate document type of the document. The type may be derived/inherited 16 * from other DocumentInterface types, but that cannot be checked here. 17 * 18 * @return <code>String</code> the document type as a string 19 */ 20 public String getDocumentType(); 21 22 /** 23 * Check if this document is of a particular type, or is derived from a particular 24 * type - i.e. inheritance is considered as well as the immediate type. 25 * 26 * @param <code>String</code> the type to check against. 27 * @return <code>boolean</code> if the document matches the given type. 28 */ 29 public boolean isDocumentType(String documentType); 30 31 public void setID(DocumentID id); 32 public DocumentID getID(); 19 33 20 /** 21 * Check if this document is of a particular type, or is derived from a particular 22 * type - i.e. inheritance is considered as well as the immediate type. 23 * 24 * @param <code>String</code> the type to check against. 25 * @return <code>boolean</code> if the document matches the given type. 26 */ 27 public boolean isDocumentType(String documentType); 34 /** 35 * Get the METS type of the document. 36 * 37 * @return <code>String</code> the document type as a string 38 */ 39 public String getMETSType(); 40 41 public void setHeader(METSHeader header); 42 public METSHeader getHeader(); 28 43 29 public void setID(DocumentID id); 30 public DocumentID getID(); 44 /** 45 * Whether the document is indexed or not. 46 * 47 * @return <code>boolean</code> <code>true</code> by default. 48 */ 49 public boolean isIndexed(); 31 50 32 /** 33 * Get the METS type of the document. 34 * 35 * @return <code>String</code> the document type as a string 36 */ 37 public String getMETSType(); 51 /** 52 * The plain text of the document. 53 * 54 * @return <code>String</code> This value may be <code>null</code> 55 * for documents which have no textual component - e.g. 56 * an image file 57 */ 58 public String getDocumentText(); 59 60 /** 61 * The metadata for the document, encoded as a map. 62 * 63 * The returned Map may have the following properties: 64 * 1) It may be <code>null</code> - e.g. for plain text documents 65 * 2) Any value in the map may be a <code>List</code> object containing 66 * more than one possible value - e.g. they key 'Author' may associated 67 * with a List of several people. 68 * 3) Any value in the map may itself be a <code>Map</code> where the 69 * encoding scheme permits groupings of hierarchical metadata items. 70 * 4) Where the namespace for a metadata item is known, its key will 71 * include the namespace. Hence Dublin Core 'Author' would be encoded 72 * as "dc.Author" as the key. 73 * 74 * @return <code>METSDescriptive</code> the metadata of the document 75 */ 76 public METSDescriptiveSet getDocumentMetadata(); 77 78 /** 79 * The metadata for a particular given upon the document, encoded as a <code>List</code> 80 * 81 * @param <code>String</code> the namespace of the metadata 82 * @param <code>String</code> the label of the values to obtain 83 * 84 * @return <code>List</code> the metadata values 85 */ 86 public List getDocumentMetadataItem(String namespace, String label); 87 88 /** 89 * The metadata for the document, encoded as a map. 90 * 91 * @param <code>String</code> the namespace and label of the metadata, separated 92 * by a colon. If no namespace is given, it is 93 * defaulted 94 * 95 * @return <code>List</code> the metadata values 96 */ 97 public List getDocumentMetadataItem(String namespaceLabel); 98 99 /** 100 * Facilitate the decoration of a document with external or extracted 101 * metadata. This is a "cheap" form which doesn't have a separate 102 * namespace element. Either the data is to be stored in the "open" 103 * Greenstone metadata namespace, or the namespace is encoded within 104 * the label. 105 * 106 * @param <code>String</code> label of the metadata, with a '.' to deliminate 107 * sub-component structures. The label may commence 108 * with a namespace followed by a colon. 109 * <p>e.g. "dc:title" for Dublin Core Title.</p> 110 * @param <code>String</code> value of the metadata 111 */ 112 public void addDocumentMetadata(String label, String value); 113 114 /** 115 * Facilitate the decoration of a document with external or extracted 116 * metadata. 117 * 118 * @param <code>String</code> namespace of the metadata 119 * @param <code>String</code> label of the metadata, with a '.' to deliminate 120 * sub-component structures 121 * @param <code>String</code> value of the metadata 122 */ 123 public void addDocumentMetadata(String namespace, String label, String value); 38 124 39 public void setHeader(METSHeader header); 40 public METSHeader getHeader(); 125 126 /** 127 * Post metadata to a file in this document - the appropriate changes 128 * should be made... 129 * 130 * @param <code>URL</code> the location of the file... 131 * @param <code>String</code> the namespace of the metadata 132 * @param <code>String</code> label of the metadata, with a '.' to deliminate 133 * sub-component structures 134 * @param <code>String</code> value of the metadata 135 */ 136 public void postFileMetadata(URL fileLocation, String namespace, String label, String value); 41 137 42 /** 43 * Whether the document is indexed or not. 44 * 45 * @return <code>boolean</code> <code>true</code> by default. 46 */ 47 public boolean isIndexed(); 138 /** 139 * The constituent files for the document: each should be wrapped up 140 * as a URL 141 * 142 * @return <code>METSFileSet</code> the files which constitute the document 143 */ 144 public METSFileSet getDocumentFiles(); 48 145 49 /** 50 * The plain text of the document. 51 * 52 * @return <code>String</code> This value may be <code>null</code> 53 * for documents which have no textual component - e.g. 54 * an image file 55 */ 56 public String getDocumentText(); 57 58 /** 59 * The metadata for the document, encoded as a map. 60 * 61 * The returned Map may have the following properties: 62 * 1) It may be <code>null</code> - e.g. for plain text documents 63 * 2) Any value in the map may be a <code>List</code> object containing 64 * more than one possible value - e.g. they key 'Author' may associated 65 * with a List of several people. 66 * 3) Any value in the map may itself be a <code>Map</code> where the 67 * encoding scheme permits groupings of hierarchical metadata items. 68 * 4) Where the namespace for a metadata item is known, its key will 69 * include the namespace. Hence Dublin Core 'Author' would be encoded 70 * as "dc.Author" as the key. 71 * 72 * @return <code>METSDescriptive</code> the metadata of the document 73 */ 74 public METSDescriptiveSet getDocumentMetadata(); 146 /** 147 * Set the constituent files for the document. 148 */ 149 public void setDocumentFiles(METSFileSet fileSet); 75 150 76 /** 77 * The metadata for the document, encoded as a map. 78 * 79 * @param <code>String</code> the namespace of the metadata 80 * @param <code>String</code> the label of the values to obtain 81 * 82 * @return <code>List</code> the metadata values 83 */ 84 public List getDocumentMetadataItem(String namespace, String label); 151 /** 152 * Obtain the structural information on the document 153 * 154 * @return <code>METSStructureSet</code> the structural information on the 155 * document. 156 */ 157 public METSStructureSet getDocumentStructure(); 158 159 /** 160 * Indicate whether the document can be stored in a native form in 161 * a METS wrapper 162 * 163 * @return <code>boolean</code> <code>true</code> if the document 164 * is just to be wrapped in a METS shell. 165 */ 166 public boolean isMETSCompatible(); 167 168 /** 169 * Write the document into a METS wrapper - for many document types, 170 * this will not actually be done by the document itself, but rather 171 * by the default writer 172 */ 173 public DocumentWriter getMETSWriter(); 174 175 /** 176 * Get the writer to send the document to an SQL database 177 */ 178 public DocumentSQLWriter getSQLWriter(); 85 179 86 /** 87 * The metadata for the document, encoded as a map. 88 * 89 * @param <code>String</code> the namespace and label of the metadata, separated 90 * by a colon. If no namespace is given, it is 91 * defaulted 92 * 93 * @return <code>List</code> the metadata values 94 */ 95 public List getDocumentMetadataItem(String namespaceLabel); 180 /** 181 * Check if the document is changed or not 182 */ 183 public boolean isModified(); 96 184 97 /** 98 * Facilitate the decoration of a document with external or extracted 99 * metadata. This is a "cheap" form which doesn't have a separate 100 * namespace element. Either the data is to be stored in the "open" 101 * Greenstone metadata namespace, or the namespace is encoded within 102 * the label. 103 * 104 * @param <code>String</code> label of the metadata, with a '.' to deliminate 105 * sub-component structures. The label may commence 106 * with a namespace followed by a colon. 107 * <p>e.g. "dc:title" for Dublin Core Title.</p> 108 * @param <code>String</code> value of the metadata 109 */ 110 public void addDocumentMetadata(String label, String value); 111 112 /** 113 * Facilitate the decoration of a document with external or extracted 114 * metadata. 115 * 116 * @param <code>String</code> namespace of the metadata 117 * @param <code>String</code> label of the metadata, with a '.' to deliminate 118 * sub-component structures 119 * @param <code>String</code> value of the metadata 120 */ 121 public void addDocumentMetadata(String namespace, String label, String value); 122 123 /** 124 * The constituent files for the document: each should be wrapped up 125 * as a URL 126 * 127 * @return <code>METSFileSet</code> the files which constitute the document 128 */ 129 public METSFileSet getDocumentFiles(); 130 131 /** 132 * Obtain the structural information on the document 133 * 134 * @return <code>METSStructureSet</code> the structural information on the 135 * document. 136 */ 137 public METSStructureSet getDocumentStructure(); 138 139 /** 140 * Indicate whether the document can be stored in a native form in 141 * a METS wrapper 142 * 143 * @return <code>boolean</code> <code>true</code> if the document 144 * is just to be wrapped in a METS shell. 145 */ 146 public boolean isMETSCompatible(); 147 148 149 /** 150 * Write the document into a METS wrapper - for many document types, 151 * this will not actually be done by the document itself, but rather 152 * by the default writer 153 */ 154 public DocumentWriter getMETSWriter(); 155 156 public DocumentSQLWriter getSQLWriter(); 185 /** 186 * Set the document modified state 187 */ 188 public void setModified(boolean isModified); 157 189 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentList.java
r5800 r5944 1 1 package org.greenstone.gsdl3.gs3build.doctypes; 2 2 3 import java.util.Iterator; 3 4 import java.util.List; 4 5 import java.util.ArrayList; … … 9 10 import java.io.IOException; 10 11 12 import java.net.URL; 13 14 import java.sql.SQLException; 15 import java.sql.ResultSet; 16 11 17 import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection; 18 import org.greenstone.gsdl3.gs3build.database.GS3SQLSelect; 19 import org.greenstone.gsdl3.gs3build.database.GS3SQLWhereItem; 20 import org.greenstone.gsdl3.gs3build.database.GS3SQLWhere; 21 import org.greenstone.gsdl3.gs3build.database.GS3SQLField; 12 22 13 23 public class DocumentList 14 24 { 15 DocumentInterface [] list; 16 int size; 17 int used; 18 DocumentIDFactoryInterface idFactory; 25 DocumentInterface [] list; // what is currently cached 26 int size; // the maximum number in the cache 27 int used; // the actual number in the cache 28 int count; // the total number of known documents 29 DocumentIDFactoryInterface idFactory; // A manufacturer of novel document IDs 30 GS3SQLConnection connection; // used to query the SQL database 31 32 private static final int maxSize = 10; 33 34 public DocumentList(GS3SQLConnection connection) 35 { this.idFactory = null; 36 this.list = new DocumentInterface[10]; 37 this.used = 0; 38 this.size = 10; 39 this.count = 0; 40 this.connection = connection; 41 } 42 43 public DocumentList(DocumentIDFactoryInterface idFactory, GS3SQLConnection connection) 44 { this.idFactory = idFactory; 45 this.list = new DocumentInterface[10]; 46 this.used = 0; 47 this.size = 10; 48 this.count = 0; 49 this.connection = connection; 50 } 51 52 /** 53 * Write the document into the document list (cache) and the database. 54 * 55 * @param <code>DocumentInterface</code> the document itself 56 */ 57 public void addDocument(DocumentInterface document) 58 { // increase cache size, etc. as necessary 59 if (this.used == this.size) { 60 if (this.size >= maxSize) { 61 for (int i = 0; i < this.size - 1; i ++) { 62 this.list[i] = this.list[i+1]; 63 } 64 this.used --; 65 } 66 else { 67 this.ensureSize((this.size * 2) > maxSize ? maxSize : (this.size * 2)); 68 } 69 } 70 71 // insert the document itself 72 this.list[this.used] = document; 73 74 // set the document identifier, if not already set 75 if (document.getID() == null) { 76 DocumentID id = this.idFactory.getNewDocumentID(document); 77 document.setID(id); 78 } 79 80 // add to the database as well 81 document.getSQLWriter().writeDocument(document, this.connection); 82 83 // Remember that we've used one more item from the cache. 84 this.used ++; 85 86 // Note additional document 87 this.count ++; 88 } 89 90 /** 91 * Note that an individual document is modified, and act accordingly 92 * 93 * @param <code>DocumentInterface</code> the document 94 */ 95 public void modifiedDocument(DocumentInterface document) 96 { document.getSQLWriter().writeDocument(document, this.connection); 97 } 98 99 /** 100 * Get an iterator across all the documents, not merely those in 101 * the cache. Note that this <code>Iterator</code> does <b>not</b> 102 * support the <code>remove()</code> function, and will raise an 103 * <code>UnsupportedOperationException</code> if you attempt to do 104 * so. 105 * 106 * @return <code>Iterator</code> the iterator across the documents. 107 */ 108 public Iterator iterator() 109 { return new DocumentListIterator(connection); 110 } 111 112 /** 113 * Get the nth member of the <b>cached</b> document list. 114 * 115 * @deprecated 116 */ 117 public DocumentInterface getDocument(int index) 118 { if (index < 0 || index >= this.used) 119 { return null; 120 } 121 return this.list[index]; 122 } 123 124 /** 125 public DocumentID getDocumentID(int index) 126 { if (index < 0 || index >= this.used) 127 { return null; 128 } 129 return this.list[index].getID(); 130 } 131 */ 132 133 protected void ensureSize(int size) 134 { DocumentInterface [] newList = new DocumentInterface[size]; 135 System.arraycopy(this.list, 0, newList, 0, this.size); 136 this.list = newList; 137 this.size = size; 138 } 139 140 public void writeDocuments(File directory) 141 { Iterator documents = this.iterator(); 142 int item = 0; 143 144 while (documents.hasNext()) 145 { DocumentInterface document = (DocumentInterface) documents.next(); 146 try 147 { item ++; 148 File localFile = new File(directory, "Doc"+Integer.toString(item)+".xml"); 149 FileWriter fileWriter = new FileWriter(localFile); 150 PrintWriter writer = new PrintWriter(fileWriter); 151 document.getMETSWriter().writeDocument(document, writer); 152 writer.close(); 153 fileWriter.close(); 154 } 155 catch (IOException io) 156 { 157 } 158 } 159 } 160 161 public void writeSQLDocuments(GS3SQLConnection connection) 162 { for (int i = 0; i < this.used; i ++) 163 { this.list[i].getSQLWriter().writeDocument(this.list[i], connection); 164 } 165 } 166 167 public static DocumentList readSQLDocuments(GS3SQLConnection connection) 168 { DocumentList list = new DocumentList(connection); 169 170 GS3SQLSelect select = new GS3SQLSelect("document"); 171 select.addField("*"); 172 173 ResultSet documents; 174 try { 175 connection.execute(select.toString()); 176 documents = connection.getResultSet(); 177 178 if (documents.first()) 179 { do 180 { DocumentInterface document = AbstractDocument.readSQL(connection, documents); 181 list.addDocument(document); 182 } 183 while (documents.next()); 184 } 185 } 186 catch (java.sql.SQLException ex) 187 { System.out.println(ex); 188 return null; 189 } 190 191 return list; 192 } 193 194 public int getCount() 195 { return this.count; 196 } 197 198 public int size() 199 { return this.used; 200 } 201 } 202 203 class DocumentListIterator implements Iterator 204 { 205 private boolean hasNext; 206 private ResultSet resultSet; 207 private GS3SQLConnection connection; 208 209 public DocumentListIterator(GS3SQLConnection connection) 210 { 211 this.connection = connection; 212 213 GS3SQLSelect select = new GS3SQLSelect("document"); 214 select.addField("*"); 215 216 try { 217 connection.execute(select.toString()); 218 this.resultSet = connection.getResultSet(); 219 this.hasNext = this.resultSet.first(); 220 } catch (SQLException ex) { 221 this.hasNext = false; 222 } 223 } 224 225 public boolean hasNext() 226 { return this.hasNext; 227 } 228 229 public Object next() 230 { 231 // get the 'next' document first 232 DocumentInterface document = AbstractDocument.readSQL(connection, this.resultSet); 233 234 // now actually step forward to the next item, so that we know if we have one! 235 try { 236 this.hasNext = this.resultSet.next(); 237 238 if (!this.hasNext) { 239 this.resultSet.close(); // be a good citizen & close used result sets 240 } 241 } catch (SQLException ex) { 242 this.hasNext = false; 243 } 244 return document; 245 } 246 247 public void remove() throws UnsupportedOperationException 248 { throw new UnsupportedOperationException("DocumentList does not support iterator removal of documents"); 249 } 250 251 public List getDocumentIdsWithFile(URL fileLocation) 252 { List reply = new ArrayList(); 253 254 GS3SQLSelect select = new GS3SQLSelect("files"); 255 select.addField("*"); 256 GS3SQLWhere where = new GS3SQLWhere(new GS3SQLWhereItem("FileLocation", "=", fileLocation.toString())); 257 select.setWhere(where); 258 259 this.connection.execute(select.toString()); 260 261 ResultSet results = this.connection.getResultSet(); 262 if (results != null) { 263 select = new GS3SQLSelect("filegroups"); 264 select.addField("DocID"); 265 select.setDistinct(true); 266 267 where = new GS3SQLWhere(); 268 where.setCondition(GS3SQLWhere.OR_CONDITION); 269 270 GS3SQLWhereItem whereItem = null; 271 272 try { 273 results.first(); 274 do { 275 int fileGroupRef = results.getInt("FileGroupRef"); 276 whereItem = new GS3SQLWhereItem("FileGroupRef", "=", Integer.toString(fileGroupRef), GS3SQLField.INTEGER_TYPE); 277 where.add(whereItem); 278 } 279 while (results.next()); 280 select.setWhere(where); 281 results.close(); 19 282 20 public DocumentList(DocumentIDFactoryInterface idFactory) 21 { this.idFactory = idFactory; 22 this.list = new DocumentInterface[10]; 23 this.used = 0; 24 this.size = 10; 25 } 26 27 public void addDocument(DocumentInterface document) 28 { if (this.used == this.size) { 29 this.ensureSize(this.size * 2); 30 } 31 this.list[this.used] = document; 32 DocumentID id = this.idFactory.getNewDocumentID(document); 33 document.setID(id); 34 this.used ++; 35 } 36 37 public DocumentInterface getDocument(int index) 38 { if (index < 0 || index >= this.used) 39 { return null; 40 } 41 return this.list[index]; 42 } 43 44 public DocumentID getDocumentID(int index) 45 { if (index < 0 || index >= this.used) 46 { return null; 47 } 48 return this.list[index].getID(); 49 } 50 51 public void ensureSize(int size) 52 { DocumentInterface [] newList = new DocumentInterface[size]; 53 System.arraycopy(this.list, 0, newList, 0, this.size); 54 this.list = newList; 55 this.size = size; 56 } 57 58 public void writeDocuments(File directory) 59 { for (int i = 0; i < this.used; i ++) 60 { try 61 { 62 File localFile = new File(directory, "Doc"+Integer.toString(i)+".xml"); 63 FileWriter fileWriter = new FileWriter(localFile); 64 PrintWriter writer = new PrintWriter(fileWriter); 65 this.list[i].getMETSWriter().writeDocument(this.list[i], writer); 66 writer.close(); 67 fileWriter.close(); 68 } 69 catch (IOException io) 70 { 71 } 72 } 73 } 74 75 public void writeSQLDocuments(GS3SQLConnection connection) 76 { for (int i = 0; i < this.used; i ++) 77 { this.list[i].getSQLWriter().writeDocument(this.list[i], connection); 78 } 79 } 80 81 public int size() 82 { return this.used; 83 } 283 this.connection.execute(select.toString()); 284 285 results = this.connection.getResultSet(); 286 results.first(); 287 do { 288 String docId = results.getString("DocID"); 289 reply.add(docId); 290 } while (results.next()); 291 } 292 catch (SQLException sqlEx) 293 { System.err.println(sqlEx); 294 } 295 } 296 return reply; 297 } 84 298 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentLoader.java
r5800 r5944 6 6 public class DocumentLoader 7 7 { 8 9 {StringBuffer reply;10 byte data[] = new byte[128];11 8 public static String getAsString(InputStream in) 9 { StringBuffer reply; 10 byte data[] = new byte[1024]; 11 int databytes; 12 12 13 reply = new StringBuffer(); 13 reply = new StringBuffer(); 14 15 try 16 { 17 do 18 { databytes = in.read(data); 19 if (databytes > 0) 20 { reply.append(new String(data, 0, databytes)); 21 } 22 } while (databytes >= 0); 23 } 24 catch (IOException io) 25 { 26 } 27 28 return reply.toString(); 29 } 14 30 15 try 16 { 17 do 18 { databytes = in.read(data); 19 if (databytes > 0) 20 { reply.append(new String(data, 0, databytes)); 21 } 22 } while (databytes >= 0); 23 } 24 catch (IOException io) 25 { 26 } 31 public static String getAsString(File file) 32 { FileInputStream in; 33 String reply = null; 27 34 28 return reply.toString(); 35 try 36 { in = new FileInputStream(file); 37 if (in == null) 38 { return null; 29 39 } 40 reply = getAsString(in); 41 42 in.close(); 43 } 44 catch (IOException io) 45 { return null; 46 } 47 return reply; 48 } 30 49 31 public static String getAsString(File file) 32 { FileInputStream in; 33 String reply = null; 34 35 try 36 { in = new FileInputStream(file); 37 if (in == null) 38 { return null; 39 } 40 reply = getAsString(in); 41 42 in.close(); 43 } 44 catch (IOException io) 45 { return null; 46 } 47 return reply; 48 } 49 50 public static String getAsString(URL url) 51 { if (url.toString().startsWith("file://")) 52 { File file = new File(url.toString().substring(7)); 53 return getAsString(file); 54 } 55 return null; 56 } 50 public static String getAsString(URL url) 51 { if (url.toString().startsWith("file://")) 52 { File file = new File(url.toString().substring(7)); 53 return getAsString(file); 54 } 55 else if (url.toString().startsWith("file:/")) 56 { File file = new File(url.toString().substring(5)); 57 return getAsString(file); 58 } 59 60 return null; 61 } 57 62 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentSQLWriter.java
r5800 r5944 25 25 // put the document into the database 26 26 try { 27 if (document.getID() != null)27 if (document.getID() != null) 28 28 { //tag = XMLTools.addAttribute(tag, "OBJID", document.getID().toString()); 29 GS3SQLInsert insert = new GS3SQLInsert("document"); 29 GS3SQLSelect select = new GS3SQLSelect("document"); 30 select.addField("*"); 31 select.setWhere(new GS3SQLWhere(new GS3SQLWhereItem("DocID", "=", document.getID().toString()))); 32 connection.execute(select.toString()); 33 34 ResultSet results = connection.getResultSet(); 35 36 if (results == null || 37 !results.first()) 38 { GS3SQLInsert insert = new GS3SQLInsert("document"); 30 39 insert.addValue("DocID", document.getID().toString()); 40 insert.addValue("DocType", document.getDocumentType()); 31 41 32 System.out.println(insert.toString());33 42 connection.execute(insert.toString()); 43 } 34 44 } 35 45 } catch (Exception ex) { -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/GMLRecogniser.java
r5800 r5944 8 8 public class GMLRecogniser implements RecogniserInterface 9 9 { 10 10 DocumentList listRepository; 11 11 12 13 {this.listRepository = listRepository;14 12 public GMLRecogniser(DocumentList listRepository) 13 { this.listRepository = listRepository; 14 } 15 15 16 17 18 19 20 21 22 23 24 25 16 public boolean parseDocument(METSFile file) 17 { 18 String MIMEType = file.getMIMEType(); 19 if (MIMEType == null || 20 MIMEType.equals("text/xml")) { 21 URL location = file.getLocation(); 22 return this.parseDocument(location); 23 } 24 return false; 25 } 26 26 27 28 {if (url.toString().startsWith("file://")) {29 30 31 32 33 34 35 36 // System.out.println(doc.getDocumentText());37 38 39 40 41 42 43 44 27 public boolean parseDocument(URL url) 28 { if (url.toString().startsWith("file://")) { 29 String fileName = url.toString().substring(7); 30 if (fileName.endsWith(".gml")) 31 { 32 System.out.println("Posting GML Document " + fileName); 33 GMLDocument doc = new GMLDocument(url); 34 this.listRepository.addDocument(doc); 35 // TODO: spawn knowledge of children too... 36 // System.out.println(doc.getDocumentText()); 37 return true; 38 } 39 } 40 else { 41 // TODO: get Mime type remotely, and then proceed if required 42 } 43 return false; 44 } 45 45 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java
r5800 r5944 14 14 public class HTMLDocument extends AbstractDocument 15 15 { 16 16 public static final String HTML_DOCUMENT_TYPE = "HTML"; 17 17 18 /** 19 * Create the HTMLDocument from a given URL - the URL may in fact be a reference 20 * to a local file. 21 * 22 * @param <code>URL</code> The location from which to load the file 23 */ 24 public HTMLDocument(URL url) 25 { super(url); 18 public HTMLDocument(DocumentID id) 19 { super(id); 20 } 26 21 27 HTMLDoc htmlDoc; 28 if (url.toString().startsWith("file://")) 29 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 30 } 31 else 32 { htmlDoc = new HTMLDoc(url); 33 } 22 /** 23 * Create the HTMLDocument from a given URL - the URL may in fact be a reference 24 * to a local file. 25 * 26 * @param <code>URL</code> The location from which to load the file 27 */ 28 public HTMLDocument(URL url) 29 { super(url); 34 30 35 this._extractDocumentFiles(htmlDoc); 36 this._extractDocumentMetadata(htmlDoc); 31 HTMLDoc htmlDoc; 32 if (url.toString().startsWith("file://")) 33 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 34 } 35 else if (url.toString().startsWith("file:/")) 36 { htmlDoc = new HTMLDoc(url, url.toString().substring(5)); 37 } 38 else 39 { htmlDoc = new HTMLDoc(url); 40 } 41 42 this._extractDocumentFiles(htmlDoc); 43 this._extractDocumentMetadata(htmlDoc); 44 } 45 46 private void _extractDocumentMetadata(HTMLDoc htmlDoc) 47 { HTMLBlock codedContent = htmlDoc.getCodedContent(); 48 boolean inTitle = false; 49 StringBuffer title = new StringBuffer(); 50 51 for (int e = 0; e < codedContent.size(); e ++) 52 { if (codedContent.elementAt(e) instanceof HTMLTag) 53 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 54 55 if (tag.tagName().equals("meta")) 56 { // check that the name of the metadata item exists 57 String name = tag.idValue("name"); 58 if (name == null || name.length() == 0) { 59 continue; 60 } 61 62 // get the value, if it exists 63 String value = tag.idValue("content"); 64 if (value != null && value.length() > 0) { 65 System.out.println(" " + value); 66 } 67 // if value does not exist, default it to being the same 68 // as the name. 69 else { 70 value = name; 71 } 72 73 this.addDocumentMetadata(name, value); 37 74 } 75 else if (tag.tagName().equals("title")) 76 { inTitle = true; 77 } 78 else if (tag.tagName().equals("/title")) 79 { inTitle = false; 80 } 81 // cut off when real body content appears - not a perfect 82 // implementation, just cheap & cheerful 83 else if (tag.tagName().equals("/head")) 84 { break; 85 } 86 else if (tag.tagName().equals("body")) 87 { break; 88 } 89 } 90 else if (inTitle == true) 91 { title.append(codedContent.elementAt(e).toString()); 92 } 93 } 94 if (title.length() > 0) 95 { this.addDocumentMetadata("title", title.toString()); 96 } 97 } 38 98 39 private void _extractDocumentMetadata(HTMLDoc htmlDoc) 40 { HTMLBlock codedContent = htmlDoc.getCodedContent(); 41 boolean inTitle = false; 42 StringBuffer title = new StringBuffer(); 99 private void _extractDocumentFiles(HTMLDoc htmlDoc) 100 { URL homeUrl = this.fileSet.getFile(0).getLocation(); 43 101 44 for (int e = 0; e < codedContent.size(); e ++) 45 { if (codedContent.elementAt(e) instanceof HTMLTag) 46 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 102 HTMLBlock codedContent = htmlDoc.getCodedContent(); 103 for (int e = 0; e < codedContent.size(); e ++) 104 { if (codedContent.elementAt(e) instanceof HTMLTag) 105 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 47 106 48 if (tag.tagName().equals("meta")) 49 { // check that the name of the metadata item exists 50 String name = tag.idValue("name"); 51 if (name == null || name.length() == 0) { 52 continue; 53 } 107 if (tag.tagName().equals("img")) 108 { String location = tag.idValue("src"); 54 109 55 // get the value, if it exists 56 String value = tag.idValue("content"); 57 if (value != null && value.length() > 0) { 58 System.out.println(" " + value); 59 } 60 // if value does not exist, default it to being the same 61 // as the name. 62 else { 63 value = name; 64 } 65 66 this.addDocumentMetadata(name, value); 67 } 68 else if (tag.tagName().equals("title")) 69 { inTitle = true; 70 } 71 else if (tag.tagName().equals("/title")) 72 { inTitle = false; 73 } 74 // cut off when real body content appears - not a perfect 75 // implementation, just cheap & cheerful 76 else if (tag.tagName().equals("/head")) 77 { break; 78 } 79 else if (tag.tagName().equals("body")) 80 { break; 81 } 82 } 83 else if (inTitle == true) 84 { title.append(codedContent.elementAt(e).toString()); 85 } 86 } 87 if (title.length() > 0) 88 { this.addDocumentMetadata("title", title.toString()); 89 } 90 } 91 92 private void _extractDocumentFiles(HTMLDoc htmlDoc) 93 { URL homeUrl = this.fileSet.getFile(0).getLocation(); 94 95 HTMLBlock codedContent = htmlDoc.getCodedContent(); 96 for (int e = 0; e < codedContent.size(); e ++) 97 { if (codedContent.elementAt(e) instanceof HTMLTag) 98 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 99 100 if (tag.tagName().equals("img")) 101 { String location = tag.idValue("src"); 102 103 try 104 { // make the url for the image, and then add it to the document list of 105 // 106 URL imgUrl = new URL(homeUrl, location); 107 METSFile file = this.fileSet.addFile(imgUrl); 108 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID()); 109 } 110 catch (MalformedURLException ex) 111 { // TODO: report exception/failure to resolve... 112 } 110 try 111 { // make the url for the image, and then add it to the document list of 112 // 113 URL imgUrl = new URL(homeUrl, location); 114 METSFile file = this.fileSet.addFile(imgUrl); 115 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID()); 116 } 117 catch (MalformedURLException ex) 118 { // TODO: report exception/failure to resolve... 113 119 } 114 120 } 115 121 } 116 122 } 123 } 117 124 118 public String getDocumentType() 119 { return HTML_DOCUMENT_TYPE; 125 public String getDocumentType() 126 { return HTML_DOCUMENT_TYPE; 127 } 128 129 public String getDocumentText() 130 { 131 HTMLDoc htmlDoc; 132 URL url =(URL) this.fileSet.getFile(0).getLocation(); 133 134 if (url.toString().startsWith("file://")) 135 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 120 136 } 121 122 public String getDocumentText() 123 { HTMLDoc htmlDoc = new HTMLDoc((URL) this.fileSet.getFile(0).getLocation(), this.fileSet.getFile(0).toString().substring(7)); 124 return htmlDoc.getContent(); 137 else if (url.toString().startsWith("file:/")) 138 { htmlDoc = new HTMLDoc(url, url.toString().substring(5)); 125 139 } 140 else 141 { htmlDoc = new HTMLDoc(url); 142 } 143 return htmlDoc.getContent(); 144 } 126 145 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLRecogniser.java
r5800 r5944 9 9 public class HTMLRecogniser implements RecogniserInterface 10 10 { 11 11 DocumentList listRepository; 12 12 13 public HTMLRecogniser(DocumentList listRepository) 14 { this.listRepository = listRepository; 15 } 13 public HTMLRecogniser(DocumentList listRepository) 14 { this.listRepository = listRepository; 15 } 16 17 public boolean parseDocument(METSFile file) 18 { 19 String MIMEType = file.getMIMEType(); 20 if (MIMEType == null || 21 MIMEType.equals("text/html")) { 22 URL location = file.getLocation(); 23 return this.parseDocument(location); 24 } 25 return false; 26 } 16 27 17 public boolean parseDocument(METSFile file) 18 { 19 String MIMEType = file.getMIMEType(); 20 if (MIMEType == null || 21 MIMEType.equals("text/html")) { 22 URL location = file.getLocation(); 23 return this.parseDocument(location); 24 } 25 return false; 26 } 28 public boolean parseDocument(URL url) 29 { String fileName = null; 27 30 28 public boolean parseDocument(URL url) 29 { if (url.toString().startsWith("file://")) { 30 String fileName = url.toString().substring(7); 31 if (fileName.endsWith(".htm") || 32 fileName.endsWith(".html")) 33 { System.out.println("Posting HTML Document " + fileName); 31 if (url.toString().startsWith("file://")) { 32 fileName = url.toString().substring(7); 33 } 34 else if (url.toString().startsWith("file:/")) { 35 fileName = url.toString().substring(5); 36 } 34 37 35 HTMLDocument doc = new HTMLDocument(url); 36 this.listRepository.addDocument(doc); 37 return true; 38 } 39 } 40 else { 41 // Get Mime type remotely, and then proceed if required 42 String mimeType = HTTPTools.getMIMEType(url); 38 if (fileName != null) { 39 if (fileName.endsWith(".htm") || 40 fileName.endsWith(".html")) 41 { System.out.println("Posting HTML Document " + fileName); 43 42 44 if (mimeType == "text/html") 45 { System.out.println("Posting HTML Document " + url.toString()); 43 HTMLDocument doc = new HTMLDocument(url); 44 this.listRepository.addDocument(doc); 45 return true; 46 } 47 } 48 else { 49 // Get Mime type remotely, and then proceed if required 50 String mimeType = HTTPTools.getMIMEType(url); 51 52 if (mimeType == "text/html") 53 { System.out.println("Posting HTML Document " + url.toString()); 46 54 47 48 49 50 51 52 53 55 HTMLDocument doc = new HTMLDocument(url); 56 this.listRepository.addDocument(doc); 57 return true; 58 } 59 } 60 return false; 61 } 54 62 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/RecogniserManager.java
r5800 r5944 8 8 public class RecogniserManager implements FileCrawlObserver 9 9 { 10 11 12 10 RecogniserInterface list[]; 11 int used; 12 int size; 13 13 14 15 {this.list = new RecogniserInterface[10];16 17 18 14 public RecogniserManager() 15 { this.list = new RecogniserInterface[10]; 16 this.used = 0; 17 this.size = 10; 18 } 19 19 20 21 {this.ensureCapacity(this.used + 1);20 public void addRecogniser(RecogniserInterface recogniser) 21 { this.ensureCapacity(this.used + 1); 22 22 23 24 25 23 this.list[this.used] = recogniser; 24 this.used ++; 25 } 26 26 27 28 {boolean result;27 public void processFile(URL url) 28 { boolean result; 29 29 30 31 {if (list[r].parseDocument(url)) {32 33 34 35 30 for (int r = 0; r < this.used; r ++) 31 { if (list[r].parseDocument(url)) { 32 break; 33 } 34 } 35 } 36 36 37 public void processFile(File file) 38 { try { 39 URL url = new URL("file://"+file.toString()); 37 public void processFile(File file) 38 { try { 39 URL url = new URL("file://"+file.toString()); 40 41 this.processFile(url); 42 } 43 catch (java.net.MalformedURLException ex) 44 { 45 System.out.println(ex); 46 } 47 } 40 48 41 this.processFile(url); 42 } 43 catch (java.net.MalformedURLException ex) 44 { 45 System.out.println(ex); 46 } 47 } 48 49 private void ensureCapacity(int size) 50 { while (size >= this.size) 51 { RecogniserInterface newList [] = new RecogniserInterface[this.size*2]; 52 this.size *= 2; 53 System.arraycopy(this.list, 0, newList, 0, this.size); 54 this.list = newList; 55 } 56 } 49 private void ensureCapacity(int size) 50 { while (size >= this.size) 51 { RecogniserInterface newList [] = new RecogniserInterface[this.size*2]; 52 this.size *= 2; 53 System.arraycopy(this.list, 0, newList, 0, this.size); 54 this.list = newList; 55 } 56 } 57 57 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/TextDocument.java
r5800 r5944 15 15 public class TextDocument extends AbstractDocument 16 16 { 17 17 public static final String TEXT_DOCUMENT_TYPE = "Text"; 18 18 19 public TextDocument(URL url)20 { super(url);21 19 public TextDocument(DocumentID id) 20 { super(id); 21 } 22 22 23 public String getDocumentType()24 { return TEXT_DOCUMENT_TYPE;25 23 public TextDocument(URL url) 24 { super(url); 25 } 26 26 27 /** 28 * A pretty minimal and lazy document text extraction process. 29 */ 30 public String getDocumentText() 31 { return DocumentLoader.getAsString((URL) this.fileSet.getFile(0).getLocation()); 32 } 27 public String getDocumentType() 28 { return TEXT_DOCUMENT_TYPE; 29 } 30 31 /** 32 * A pretty minimal and lazy document text extraction process. 33 */ 34 public String getDocumentText() 35 { return DocumentLoader.getAsString((URL) this.fileSet.getFile(0).getLocation()); 36 } 33 37 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/TextRecogniser.java
r5800 r5944 5 5 6 6 import org.greenstone.gsdl3.gs3build.metadata.*; 7 import org.greenstone.gsdl3.gs3build.util.HTTPTools; 7 8 8 9 public class TextRecogniser implements RecogniserInterface 9 10 { 10 DocumentList listRepository; 11 DocumentList listRepository; 12 13 public TextRecogniser(DocumentList listRepository) 14 { this.listRepository = listRepository; 15 } 11 16 12 public TextRecogniser(DocumentList listRepository) 13 { this.listRepository = listRepository; 14 } 17 public boolean parseDocument(METSFile file) 18 { 19 String MIMEType = file.getMIMEType(); 20 if (MIMEType == null || 21 MIMEType.equals("text/plain")) { 22 URL location = file.getLocation(); 23 return this.parseDocument(location); 24 } 25 return false; 26 } 15 27 16 public boolean parseDocument(METSFile file) 17 { 18 String MIMEType = file.getMIMEType(); 19 if (MIMEType == null || 20 MIMEType.equals("text/plain")) { 21 URL location = file.getLocation(); 22 return this.parseDocument(location); 23 } 24 return false; 25 } 28 public boolean parseDocument(URL url) 29 { String fileName = null; 26 30 27 public boolean parseDocument(URL url) 28 { if (url.toString().startsWith("file://")) { 29 String fileName = url.toString().substring(7); 31 if (url.toString().startsWith("file://")) { 32 fileName = url.toString().substring(7); 33 } 34 else if (url.toString().startsWith("file:/")) { 35 fileName = url.toString().substring(5); 36 } 30 37 31 if (fileName.endsWith(".txt") || 32 fileName.endsWith(".text")) 33 { this.listRepository.addDocument(new TextDocument(url)); 34 // TODO: spawn knowledge of children too... 35 System.out.println(">>> Posting text document " + fileName); 36 return true; 37 } 38 } 39 else 40 { // Check MIME type 41 } 38 if (fileName != null) { 39 if (fileName.endsWith(".txt") || 40 fileName.endsWith(".text")) 41 { this.listRepository.addDocument(new TextDocument(url)); 42 // TODO: spawn knowledge of children too... 43 System.out.println(">>> Posting text document " + fileName); 44 return true; 45 } 46 } 47 else 48 { // Check MIME type 49 String mimeType = HTTPTools.getMIMEType(url); 50 51 if (mimeType == "text/plain") 52 { System.out.println("Posting Text document " + url.toString()); 53 54 TextDocument doc = new TextDocument(url); 55 this.listRepository.addDocument(doc); 56 return true; 57 } 58 } 42 59 43 44 60 return false; 61 } 45 62 }
Note:
See TracChangeset
for help on using the changeset viewer.