package org.greenstone.gsdl3.gs3build.doctypes; import java.util.Iterator; import java.util.List; import java.util.ArrayList; import java.io.PrintWriter; import java.io.FileWriter; import java.io.File; import java.io.IOException; import java.net.URL; import java.sql.SQLException; import java.sql.Statement; import java.sql.ResultSet; import org.greenstone.gsdl3.gs3build.metadata.METSFileGroup; import org.greenstone.gsdl3.gs3build.database.GS3SQLConnection; import org.greenstone.gsdl3.gs3build.database.GS3SQLSelect; import org.greenstone.gsdl3.gs3build.database.GS3SQLWhereItem; import org.greenstone.gsdl3.gs3build.database.GS3SQLWhere; import org.greenstone.gsdl3.gs3build.database.GS3SQLField; public class DocumentList { DocumentInterface [] list; // what is currently cached int size; // the maximum number in the cache int used; // the actual number in the cache int count; // the total number of known documents DocumentIDFactoryInterface idFactory; // A manufacturer of novel document IDs GS3SQLConnection connection; // used to query the SQL database private static final int maxSize = 3; public DocumentList(GS3SQLConnection connection) { this.idFactory = null; this.list = new DocumentInterface[3]; this.used = 0; this.size = 3; this.count = 0; this.connection = connection; } public DocumentList(DocumentIDFactoryInterface idFactory, GS3SQLConnection connection) { this.idFactory = idFactory; this.list = new DocumentInterface[3]; this.used = 0; this.size = 3; this.count = 0; this.connection = connection; } /** * Obtain the list of DocumentID objects representing the unique * document identifiers of documents that refer to the file given as a parameter. * * @param URL the location of the file to match * * @return List of DocumentID reference identifiers. */ public List getDocumentIdsWithFile(URL fileLocation) { List reply = new ArrayList(); GS3SQLSelect select = new GS3SQLSelect("files"); select.addField("*"); GS3SQLWhere where = new GS3SQLWhere(new GS3SQLWhereItem("FileLocation", "=", fileLocation.toString())); select.setWhere(where); try { Statement statement = connection.createStatement(); ResultSet results = statement.executeQuery(select.toString()); select = new GS3SQLSelect("filegroups"); select.addField("DocID"); select.setDistinct(true); where = new GS3SQLWhere(); where.setCondition(GS3SQLWhere.OR_CONDITION); GS3SQLWhereItem whereItem = null; results.first(); do { int fileGroupRef = results.getInt("FileGroupRef"); whereItem = new GS3SQLWhereItem("FileGroupRef", "=", Integer.toString(fileGroupRef), GS3SQLField.INTEGER_TYPE); where.add(whereItem); } while (results.next()); select.setWhere(where); results = statement.executeQuery(select.toString()); results.first(); do { String docId = results.getString("DocID"); reply.add(docId); } while (results.next()); statement.close(); } catch (SQLException sqlEx) { System.err.println("DocumentList.getDocumentIdsWithFile(): "+sqlEx); } return reply; } /** * Get a list of documents that match a given set of patterns, * within a given URL node. * * @param List the list of patterns to match * @param String the partial URL of the root node under which o * match files. NB: this is a String as the URL may be * incomplete and not properly match the strict requirements for URL */ public List findDocumentIdsUsingFiles(List fileRefs, String withinNode) { StringBuffer queryBuffer = new StringBuffer("SELECT FileGroupRef FROM files WHERE FileLocation "); Iterator files = fileRefs.iterator(); while (files.hasNext()) { String file = files.next().toString(); if (withinNode != null) { queryBuffer.append("REGEXP \"^"); queryBuffer.append(withinNode); queryBuffer.append(".*"); } else { queryBuffer.append("REGEXP \""); } queryBuffer.append(file); queryBuffer.append("\""); if (files.hasNext()) { queryBuffer.append(" OR "); } } queryBuffer.append(";"); return this.findDocumentIdsUsingFileQuery(queryBuffer.toString()); } public List findDocumentIdsUsingFile(String fileRef, String withinNode) { String query = "SELECT FileGroupRef FROM files WHERE FileLocation REGEXP \"^"+withinNode+".*"+fileRef+"\";"; return this.findDocumentIdsUsingFileQuery(query); } /** * Return a list of document identifiers against a simple pattern. No root node is given, so * any file matching the pattern given will be returned. USE WITH CAUTION!!! * * @param String a fragment of file pathname to match against. * * @return List of DocumentID objects. */ public List findDocumentIdsUsingFile(String fileRef) { // Get the simple list of file objects & their file group reference String query = "SELECT FileGroupRef FROM files WHERE FileLocation REGEXP \"" + fileRef +"\";"; return this.findDocumentIdsUsingFileQuery(query); } public List findDocumentIdsUsingFileExact(String fileRef) { // Get the simple list of file objects & their file group reference String query = "SELECT FileGroupRef FROM files WHERE FileLocation=\"" + fileRef +"\";"; return this.findDocumentIdsUsingFileQuery(query); } private List findDocumentIdsUsingFileQuery(String query) { try { Statement statement = connection.createStatement(); ResultSet results = statement.executeQuery(query); if (!results.first()) { statement.close(); return null; } // get a list of group ids first and turn it into a query on filegroups StringBuffer queryBuffer = new StringBuffer("SELECT * FROM filegroups WHERE "); boolean first = true; do { int groupRef = results.getInt("FileGroupRef"); if (first) { first = false; } else { queryBuffer.append(" OR "); } queryBuffer.append("FileGroupRef=" + Integer.toString(groupRef)); } while (results.next()); queryBuffer.append(";"); // make a holder for the actual file section identifiers List divisions = new ArrayList(); // expand (or, in fact, contract) through the document // structures...recreating new filegroup queries as necessary while (queryBuffer.length() > 0) { results = statement.executeQuery(queryBuffer.toString()); if (!results.first()) { statement.close(); return null; } queryBuffer = new StringBuffer(); do { String type = results.getString("ParentType"); String parentRef = results.getString("ParentRef"); if (type.equals(METSFileGroup.SECTION_PARENT)) { divisions.add(parentRef); } else { if (queryBuffer.length() > 0) { queryBuffer.append(" OR "); } queryBuffer.append("FileGroupRef=" + parentRef); } } while (results.next()); if (queryBuffer.length() > 0) { queryBuffer.insert(0, "SELECT * FROM filegroups WHERE "); queryBuffer.append(";"); } } // ok, now find all the sections in which we are interested... queryBuffer.setLength(0); queryBuffer.append("SELECT DISTINCT DocID FROM filesection WHERE "); Iterator iterator = divisions.iterator(); first = true; while (iterator.hasNext()) { String ref = iterator.next().toString(); if (first) { first = false; } else { queryBuffer.append(" OR "); } queryBuffer.append("FileSectionRef="+ref); } queryBuffer.append(";"); // execute the division query results = statement.executeQuery(queryBuffer.toString()); if (!results.first()) { statement.close(); return null; } List reply = new ArrayList(); do { reply.add(results.getString("DocID")); } while (results.next()); statement.close(); return reply; } catch (SQLException ex) { System.err.println("DocumentList.findDocumentIdsUsingFileQuery()"+ ex); } return null; } /** * Cache a document into the cache, without writing it to the database. * Used directly by other parts of DocumentList when they know that the * document is in the database already, or they are going to write it * themselves... * * @param DocumentInterface the document to cache */ private void cacheDocument(DocumentInterface document) { // increase cache size, etc. as necessary if (this.used == this.size) { if (this.size >= maxSize) { for (int i = 0; i < this.size - 1; i ++) { this.list[i] = this.list[i+1]; } this.used --; } else { this.ensureSize((this.size * 2) > maxSize ? maxSize : (this.size * 2)); } } // insert the document itself this.list[this.used] = document; } /** * Write the document into the document list (cache) and the database. * * @param DocumentInterface the document itself */ public void addDocument(DocumentInterface document) { // initially, test if the document has a duplicate... String duplicateDocID = document.getDuplicateID(this.connection); if (duplicateDocID.length() > 0) { System.out.println("Found duplicate document "); return; } // first cache it... this.cacheDocument(document); // set the document identifier, if not already set if (document.getID() == null) { System.out.println("Posting new docuument ID"); DocumentID id = this.idFactory.getNewDocumentID(document); document.setID(id); } // add to the database as well, if it is modified... if (document.isChanged()) { System.out.println("Document was changed"); document.getSQLWriter().writeDocument(document, this.connection); } // Remember that we've used one more item from the cache. this.used ++; // Note additional document this.count ++; } /** * Note that an individual document is modified, and act accordingly * * @param DocumentInterface the document */ public void storeChangedDocument(DocumentInterface document) { document.getSQLWriter().writeDocument(document, this.connection); } /** * Get an iterator across all the documents, not merely those in * the cache. Note that this Iterator does not * support the remove() function, and will raise an * UnsupportedOperationException if you attempt to do * so. * * @return Iterator the iterator across the documents. */ public Iterator iterator() { return new DocumentListIterator(connection); } /** * Get the nth member of the cached document list. * * @deprecated */ public DocumentInterface getDocument(int index) { if (index < 0 || index >= this.used) { return null; } return this.list[index]; } /** * Simple "obtain a document" function */ public DocumentInterface getDocument(DocumentID documentId) { DocumentInterface document = DocumentFactory.readSQLDocument(connection, documentId); if (document != null) { this.cacheDocument(document); } return document; } /** * Update timestamps on an entire document list - done at the beginning of a build cycle * * @param The date of the new build cycle */ public void updateTimestamps(long buildTimeStamp) { Iterator documents = this.iterator(); int item = 0; while (documents.hasNext()) { DocumentInterface document = (DocumentInterface) documents.next(); long thisTimeStamp = document.getFilesDatestamp(); long lastTimeStamp = document.getModifiedDatestamp(); if (thisTimeStamp > lastTimeStamp) { System.out.println("Updating timestamps " + thisTimeStamp + " " + lastTimeStamp); DocumentSQLWriter.touchDocument(document.getID(), this.connection, buildTimeStamp, thisTimeStamp); } } } /** * A convenience method to map onto the old Vector source code... */ protected void ensureSize(int size) { DocumentInterface [] newList = new DocumentInterface[size]; System.arraycopy(this.list, 0, newList, 0, this.size); this.list = newList; this.size = size; } /** * Write the documents into a directory as METS/XML */ public void writeDocuments(File directory) { Iterator documents = this.iterator(); int item = 0; while (documents.hasNext()) { DocumentInterface document = (DocumentInterface) documents.next(); try { item ++; File localFile = new File(directory, "Doc"+Integer.toString(item)+".xml"); FileWriter fileWriter = new FileWriter(localFile); PrintWriter writer = new PrintWriter(fileWriter); document.getMETSWriter().writeDocument(document, writer); writer.close(); fileWriter.close(); } catch (IOException io) { } } } public void writeSQLDocuments(GS3SQLConnection connection) { for (int i = 0; i < this.used; i ++) { this.list[i].getSQLWriter().writeDocument(this.list[i], connection); } } public static DocumentList readSQLDocuments(GS3SQLConnection connection) { DocumentList list = new DocumentList(connection); GS3SQLSelect select = new GS3SQLSelect("document"); select.addField("*"); try { Statement statement = connection.createStatement(); ResultSet documents = statement.executeQuery(select.toString()); if (documents.first()) { do { DocumentInterface document = AbstractDocument.readSQL(connection, documents); list.addDocument(document); } while (documents.next()); } statement.close(); } catch (java.sql.SQLException ex) { System.out.println("DocumentList.writeSQLDocuments(): "+ex); return null; } return list; } public int getCount() { return this.count; } public int size() { return this.used; } } class DocumentListIterator implements Iterator { private boolean hasNext; private Statement statement; private ResultSet resultSet; private GS3SQLConnection connection; public DocumentListIterator(GS3SQLConnection connection) { this.connection = connection; GS3SQLSelect select = new GS3SQLSelect("document"); select.addField("*"); try { this.statement = connection.createStatement(); this.resultSet = statement.executeQuery(select.toString()); this.hasNext = this.resultSet.first(); } catch (SQLException ex) { System.err.println("DocumentListIterator(): "+ex); this.hasNext = false; } } public boolean hasNext() { return this.hasNext; } public Object next() { // get the 'next' document first DocumentInterface document = AbstractDocument.readSQL(connection, this.resultSet); // now actually step forward to the next item, so that we know if we have one! try { this.hasNext = this.resultSet.next(); if (!this.hasNext) { this.statement.close(); // be a good citizen & close used statement } } catch (SQLException ex) { System.err.println("DocumentList.iterator.next(): "+ex); this.hasNext = false; } return document; } public void remove() throws UnsupportedOperationException { throw new UnsupportedOperationException("DocumentList does not support iterator removal of documents"); } }