Changeset 8408


Ignore:
Timestamp:
2004-10-22T13:56:15+13:00 (20 years ago)
Author:
schweer
Message:

George's changes to detect documents that are new or have changed since the last build process. (his CVS account currently doesn't work)

Location:
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build
Files:
1 added
14 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/BuildManager.java

    r7210 r8408  
    111111    // TODO: add expansion (e.g. Zip files)
    112112   
     113    // Crawl the file tree - will recognise documents
    113114    for (int i = 0; i < this.inputRoots.size(); i ++)
    114115    { FileCrawler fileCrawler = new FileCrawler(new File((String) this.inputRoots.get(i)), recogniserManager);
     
    116117      fileCrawler.crawl();
    117118    }
     119
     120    // Extract phase, etc.
    118121    this.extractorManager.extractDocuments();
    119122    this.classifierManager.classifyDocuments();
    120123    this.indexerManager.indexDocuments();
     124
     125    // Timestamp management - update all timestamps on modified dates...
     126    //
     127    // This should only occur at the end of building in case the build is cancelled...
     128    docList.updateTimestamps(this.collectionManager.getBuildDate());
    121129
    122130    // TODO: validation phase
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/CollectionManager.java

    r7478 r8408  
    155155    this.siteHome = GSFile.siteHome(gsdl3Root, site);
    156156    File site_dir = new File(this.siteHome);
     157    System.out.println(site_dir);
    157158    if (!site_dir.exists()) {
    158159      System.out.println("Error: Non-existant site ("+site+") specified");
     
    378379  {
    379380    return this.database;
     381  }
     382
     383  public Date getBuildDate()
     384  { return this.lastBuildDate.getTime();
    380385  }
    381386
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/classifier/ClassifierManager.java

    r7470 r8408  
    8585        }
    8686       
    87         if (document.isModified()) {
     87        if (document.isChanged()) {
    8888          //          System.out.println("Writing document " + document.getID());
    89           this.documents.modifiedDocument(document);
     89          this.documents.storeChangedDocument(document);
    9090        }
    9191      }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/classifier/GS2HierarchyClassifier.java

    r7470 r8408  
    3737    public void recordClassification(String label)
    3838    { this.document.addDocumentMetadata("gsdl3", "classified", label);
    39       this.document.setModified(true);
     39      this.document.setChanged(true);
    4040      //      System.out.println("Assigned document " + this.document.getID().toString() + " to " + label);
    4141    }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java

    r7466 r8408  
    4646  java.sql.Timestamp firstDate;
    4747  java.sql.Timestamp indexDate;
     48  java.sql.Timestamp modifiedDate;
    4849
    4950  /**
     
    6566    this.firstDate = new java.sql.Timestamp(thisDate.getTime());
    6667    this.indexDate = new java.sql.Timestamp(thisDate.getTime());
     68    this.modifiedDate = null; // as a signature that the modified date needs finding...
    6769  }
    6870 
     
    131133   *  Get the date that this file was modified
    132134   */
     135  public long getFilesDatestamp()
     136  { return this.fileSet.getModifiedDatestamp();
     137  }
     138
     139  /**
     140   *  Get the stored modified date for this document (initialised from the files' datestamp if not yet set)
     141   */
    133142  public long getModifiedDatestamp()
    134   { return this.fileSet.getModifiedDatestamp();
     143  { if (this.modifiedDate == null) {
     144      this.setModifiedDatestamp();
     145    }
     146    return this.modifiedDate.getTime();
     147  }
     148
     149  /**
     150   *  Update/set the date of the most recent file modification
     151   */
     152  public void setModifiedDatestamp()
     153  { this.modifiedDate = new java.sql.Timestamp(this.fileSet.getModifiedDatestamp());
     154  }
     155
     156  /**
     157   *  Get the date that this document was first indexed
     158   */
     159  public long getAccessionDate()
     160  { return this.firstDate.getTime();
     161  }
     162
     163  /**
     164   *  Get the date that this document was last indexed
     165   */
     166  public long getLastIndexedDate()
     167  { return this.indexDate.getTime();
     168  }
     169
     170  /**
     171   *  Set the last indexed date for this document;
     172   */
     173  public void setLastIndexedDate(long timestamp)
     174  { this.indexDate = new java.sql.Timestamp(timestamp);
    135175  }
    136176
     
    145185   *                                 database.
    146186   */
    147   public boolean hasDuplicate(GS3SQLConnection connection)
     187  public String getDuplicateID(GS3SQLConnection connection)
    148188  { //String query = "SELECT * FROM document INNER JOIN filegroups ON document.docId=filegroups.docId WHERE DocType=\"" + HTML_DOCUMENT_TYPE + "\"";
    149189
     
    173213        String docType = innerSet.getString("DocType");
    174214        if (docType.equals(this.getDocumentType())) {
    175           return true;
     215          return docId;
    176216        }
    177217      }
     
    183223    }
    184224
    185     return false;
     225    return "";
    186226  }
    187227
     
    416456
    417457      // Append the document date information
    418       document.indexDate = sqlResult.getTimestamp("IndexedDate");
    419       document.firstDate = sqlResult.getTimestamp("AccessionDate");
     458      document.indexDate    = sqlResult.getTimestamp("IndexedDate");
     459      document.firstDate    = sqlResult.getTimestamp("AccessionDate");
     460      document.modifiedDate = sqlResult.getTimestamp("ModifiedDate");
    420461
    421462      // Get the individual components of the document
     
    428469
    429470      // indicate that the document is not currently modified
    430       document.setModified(false);
     471      document.setChanged(false);
    431472      return document;
    432473    }
     
    440481   * 
    441482   */
    442   public boolean isModified()
     483  public boolean isChanged()
    443484  { return this.isModified;
    444485  }
    445486
    446   public void setModified(boolean isModified)
     487  public void setChanged(boolean isModified)
    447488  { this.isModified = isModified;
    448489  }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentInterface.java

    r7466 r8408  
    5151
    5252  /**
    53    *  Get the date that this file was modified
     53   *  Get the latest date that a component file was modified...
     54   */
     55  public long getFilesDatestamp();
     56
     57  /**
     58   *  Get the stored modified date for the files datestamp...
    5459   */
    5560  public long getModifiedDatestamp();
    5661
    5762  /**
     63   *  Update/set the stored date of the most recent file modification
     64   */
     65  public void setModifiedDatestamp();
     66
     67  /**
     68   *  Get the date that this document was first indexed
     69   */
     70  public long getAccessionDate();
     71
     72  /**
     73   *  Get the date that this document was last indexed
     74   */
     75  public long getLastIndexedDate();
     76
     77  /**
     78   *  Set the last indexed date for this document;
     79   */
     80  public void setLastIndexedDate(long date);
     81
     82  /**
    5883   *  Check if the document matches another in the database
    5984   */
    60   public boolean hasDuplicate(GS3SQLConnection connection);
     85  public String getDuplicateID(GS3SQLConnection connection);
    6186   
    6287  /**
     
    249274   *  Check if the document is changed or not
    250275   */
    251   public boolean isModified();
     276  public boolean isChanged();
    252277
    253278  /**
    254279   *  Set the document modified state
    255280   */
    256   public void setModified(boolean isModified);
     281  public void setChanged(boolean isModified);
    257282}
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentList.java

    r7190 r8408  
    5252  }
    5353
    54 
     54  /**
     55   *  Obtain the list of <code>DocumentID</code> objects representing the unique
     56   *  document identifiers of documents that refer to the file given as a parameter.
     57   *
     58   *  @param <code>URL</code> the location of the file to match
     59   *
     60   *  @return <code>List</code> of <code>DocumentID</code> reference identifiers.
     61   */
    5562  public List getDocumentIdsWithFile(URL fileLocation)
    5663  { List reply = new ArrayList();
     
    104111   *  Get a list of documents that match a given set of patterns,
    105112   *  within a given URL node.
     113   *
     114   *  @param <code>List</code> the list of patterns to match
     115   *  @param <code>String</code> the partial URL of the root node under which to
     116   *         match files.  NB: this is a <code>String</code> as the URL may be
     117   *         incomplete and not properly match the strict requirements for <code>URL</code>
    106118   */
    107119  public List findDocumentIdsUsingFiles(List fileRefs, String withinNode)
     
    137149  }
    138150
     151  /**
     152   *  Return a list of document identifiers against a simple pattern.  No root node is given, so
     153   *  any file matching the pattern given will be returned. USE WITH CAUTION!!!
     154   *
     155   *  @param <code>String</code> a fragment of file pathname to match against.
     156   *
     157   *  @return <code>List</code> of <code>DocumentID</code> objects.
     158   */
    139159  public List findDocumentIdsUsingFile(String fileRef)
    140160  {
     
    290310  public void addDocument(DocumentInterface document)
    291311  { // initially, test if the document has a duplicate...
    292     if (document.hasDuplicate(this.connection)) {
     312    String duplicateDocID = document.getDuplicateID(this.connection);
     313    if (duplicateDocID.length() > 0) {
    293314      System.out.println("Found duplicate document ");
    294315      return;
     
    305326
    306327    // add to the database as well, if it is modified...
    307     if (document.isModified()) {
     328    if (document.isChanged()) {
    308329      document.getSQLWriter().writeDocument(document, this.connection);
    309330    }
     
    321342   *  @param <code>DocumentInterface</code> the document
    322343   */
    323   public void modifiedDocument(DocumentInterface document)
     344  public void storeChangedDocument(DocumentInterface document)
    324345  { document.getSQLWriter().writeDocument(document, this.connection);
    325346  }
     
    350371  }
    351372
     373  /**
     374   *  Simple "obtain a document" function
     375   */
    352376  public DocumentInterface getDocument(DocumentID documentId)
    353377  {
     
    360384
    361385  /**
    362   public DocumentID getDocumentID(int index)
    363   { if (index < 0 || index >= this.used)
    364     { return null;
    365     }
    366     return this.list[index].getID();
    367   }
    368   */
    369 
     386   *  Update timestamps on an entire document list - done at the end of a build cycle
     387   *
     388   *  @param <code>java.util.Date</code> the date of the new build cycle
     389   */
     390  public void updateTimestamps(java.util.Date time)
     391  { Iterator documents = this.iterator();
     392    int item = 0;
     393
     394    while (documents.hasNext())
     395    { DocumentInterface document = (DocumentInterface) documents.next();
     396     
     397      long thisTimeStamp = document.getFilesDatestamp();
     398      long lastTimeStamp = document.getModifiedDatestamp();
     399
     400      if (thisTimeStamp > lastTimeStamp) {
     401    System.out.println("Updating timestamps " + thisTimeStamp + " " + lastTimeStamp);
     402
     403    DocumentSQLWriter.touchDocument(document.getID(), this.connection, time.getTime());
     404      }
     405    }
     406  }
     407
     408  /**
     409   *  A convenience method to map onto the old Vector source code...
     410   */
    370411  protected void ensureSize(int size)
    371412  { DocumentInterface [] newList = new DocumentInterface[size];
     
    375416  }
    376417
     418  /**
     419   *  Write the documents into a directory as METS/XML
     420   */
    377421  public void writeDocuments(File directory)
    378422  { Iterator documents = this.iterator();
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentSQLWriter.java

    r6697 r8408  
    1313  public DocumentSQLWriter()
    1414  {
     15  }
     16
     17  public static boolean touchDocument(DocumentID docID, GS3SQLConnection connection, long touchTime)
     18  { GS3SQLUpdate update = new GS3SQLUpdate("document");
     19    update.setWhere(new GS3SQLWhere(new GS3SQLWhereItem("DocID", "=", docID.toString())));
     20    update.addDate("IndexedDate", new java.sql.Timestamp(touchTime));
     21    connection.execute(update.toString());
     22    System.out.println(update.toString());
     23    return true;
    1524  }
    1625
     
    3948      insert.addValue("DocID", document.getID().toString());
    4049      insert.addValue("DocType", document.getDocumentType());
    41       // TODO: avoid this terrible cast
    42       insert.addDate("AccessionDate", ((AbstractDocument) document).firstDate);
     50
     51      insert.addDate("AccessionDate", new java.sql.Timestamp(document.getAccessionDate()));
     52      insert.addDate("IndexedDate", new java.sql.Timestamp(document.getLastIndexedDate()));
     53      insert.addDate("ModifiedDate", new java.sql.Timestamp(document.getModifiedDatestamp()));
    4354
    4455      connection.execute(insert.toString());
     56    }
     57    else {
     58      /* redundant code - not used... */
     59      GS3SQLUpdate update = new GS3SQLUpdate("document");
     60      update.setWhere(new GS3SQLWhere(new GS3SQLWhereItem("DocID", "=", document.getID().toString())));
     61      connection.execute(update.toString());
    4562    }
    4663      }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java

    r6379 r8408  
    171171
    172172      System.out.println("Writing modified document " + document.getID());
    173       documentList.modifiedDocument(document);
     173      documentList.storeChangedDocument(document);
    174174    }
    175175      }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/MetaXMLExtractor.java

    r6503 r8408  
    103103
    104104        //  System.out.println("Writing modified document " + document.getID());
    105         documentList.modifiedDocument(document);
     105        documentList.storeChangedDocument(document);
    106106      }
    107107    }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/IndexerManager.java

    r6897 r8408  
    1414  int size;
    1515  int used;
    16   DocumentList documents;
     16  DocumentList documentList;
    1717
    1818  public static final String outputDir = "outputDir";
     
    2424    this.size      = 10;
    2525    this.used      = 0;
    26     this.documents = documentList;
     26    this.documentList = documentList;
    2727  }
    2828
     
    5050          continue;
    5151      }
    52     Iterator iterator = this.documents.iterator();
     52    Iterator iterator = this.documentList.iterator();
    5353
    5454    while (iterator.hasNext()) {
     
    6161       
    6262        // note any changes made to this document...
    63         if (document.isModified()) {
    64           this.documents.modifiedDocument(document);
     63        if (document.isChanged()) {
     64          this.documentList.storeChangedDocument(document);
    6565          //          System.out.println("Writing document "+document.getID());
    6666        }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java

    r7583 r8408  
    229229       
    230230        descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo));
    231         metsDoc.setModified(true);
     231        metsDoc.setChanged(true);
    232232        //  System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
    233233    } // section level
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/metadata/METSFile.java

    r7465 r8408  
    296296
    297297  /**
    298    *
     298   *  Get modified file date
    299299   */
    300300  public long getModifiedDatestamp()
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/GS3SQLConnection.java

    r7306 r8408  
    201201      docTable.addProperty("AccessionDate", GS3SQLField.DATETIME_TYPE);
    202202      docTable.addProperty("IndexedDate", GS3SQLField.DATETIME_TYPE);
     203      docTable.addProperty("ModifiedDate", GS3SQLField.DATETIME_TYPE);
    203204      statement = this.connection.createStatement();
    204205      statement.execute(docTable.toString());
Note: See TracChangeset for help on using the changeset viewer.