Changeset 5943
- Timestamp:
- 2003-11-24T14:25:41+13:00 (20 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build
- Files:
  - 1 added
  - 4 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/Build.java
r5798 r5943 6 6 public class Build 7 7 { 8 8 BuildManager manager; 9 9 10 public Build(List inputRoots, String collection, String outputDir) 11 { this.manager = new BuildManager(inputRoots, collection, outputDir); 10 public Build(List inputRoots, String collection, String outputDir) 11 { this.manager = new BuildManager(inputRoots, collection, outputDir); 12 } 13 14 public void run() 15 { this.manager.run(); 16 } 17 18 public static void main(String args[]) 19 { int a = 0; 20 List inputDirs = new ArrayList(); 21 String collection = null; 22 String outputDir = null; 23 24 while (a < args.length) 25 { System.out.println(args[a]); 26 if (args[a].equals("-inputdir")) { 27 if (a < args.length - 1 && 28 args[a+1].charAt(0) != '-') 29 { 30 inputDirs.add(args[a+1]); 31 a ++; 12 32 } 33 } 34 else if (args[a].equals("-collect")) { 35 if (a < args.length - 1 && 36 args[a+1].charAt(0) != '-') 37 { 38 collection = args[a+1]; 39 a ++; 40 } 41 } 42 else if (args[a].equals("-outputDir")) { 43 if (a < args.length - 1 && 44 args[a+1].charAt(0) != '-') 45 { outputDir = args[a+1]; 46 a ++; 47 } 48 } 49 a ++; 50 } 13 51 14 public void run() 15 { this.manager.run(); 16 } 52 if (collection == null && inputDirs.size() == 0) 53 { System.err.println("At least one input directory must be given, or a collection name"); 54 System.exit(0); 55 } 17 56 18 public static void main(String args[]) 19 { int a = 0; 20 List inputDirs = new ArrayList(); 21 String collection = null; 22 String outputDir = null; 57 Build build = new Build(inputDirs, collection, outputDir); 58 build.run(); 23 59 24 while (a < args.length) 25 { System.out.println(args[a]); 26 if (args[a].equals("-inputdir")) { 27 if (a < args.length - 1 && 28 args[a+1].charAt(0) != '-') 29 { 30 inputDirs.add(args[a+1]); 31 a ++; 32 } 33 } 34 else if (args[a].equals("-collect")) { 35 if (a < args.length - 1 && 36 args[a+1].charAt(0) != '-') 37 { 38 collection = args[a+1]; 39 a ++; 40 } 41 } 42 else if (args[a].equals("-outputDir")) { 43 if (a < 
args.length - 1 && 44 args[a+1].charAt(0) != '-') 45 { outputDir = args[a+1]; 46 a ++; 47 } 48 } 49 a ++; 50 } 51 52 if (collection == null && inputDirs.size() == 0) 53 { System.err.println("At least one input directory must be given, or a collection name"); 54 System.exit(0); 55 } 56 57 Build build = new Build(inputDirs, collection, outputDir); 58 build.run(); 59 60 return; 61 } 60 return; 61 } 62 62 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/BuildManager.java
r5823 r5943 4 4 import org.greenstone.gsdl3.gs3build.indexers.*; 5 5 import org.greenstone.gsdl3.gs3build.extractor.*; 6 import org.greenstone.gsdl3.gs3build.classifier.*; 6 7 7 8 import java.io.File; … … 13 14 RecogniserManager recogniserManager; 14 15 IndexerManager indexerManager; 16 ClassifierManager classifierManager; 15 17 ExtractorManager extractorManager; 16 18 DocumentList docList; … … 18 20 String outputDir; 19 21 20 21 22 23 22 public BuildManager(List inputRoots, String collection, String outputDir) 23 { 24 // get the collection configuration information 25 this.collectionManager = new CollectionManager(collection); 24 26 25 // set up the list of input directories 26 if (inputRoots.size() == 0) { 27 inputRoots.add(this.collectionManager.getImportDirectory()); 28 System.out.println("Adding collection import directory to input list"); 29 } 30 this.docList = new DocumentList(new DocumentIDFactory(this.collectionManager)); 31 this.inputRoots = inputRoots; 27 // set up the list of input directories 28 if (inputRoots.size() == 0) { 29 inputRoots.add(this.collectionManager.getImportDirectory()); 30 System.out.println("Adding collection import directory to input list " + this.collectionManager.getImportDirectory()); 31 } 32 this.docList = new DocumentList(new DocumentIDFactory(this.collectionManager), 33 collectionManager.getDatabase()); 34 this.inputRoots = inputRoots; 35 36 // set up the output directory 37 this.outputDir = outputDir != null ? outputDir : this.collectionManager.getBuildDirectory(); 38 File outputDirFile = new File(this.outputDir); 39 if (!outputDirFile.exists() || !outputDirFile.isDirectory()) 40 { System.err.println("Unable to open directory " + this.outputDir + " for writing"); 41 System.exit(1); 42 } 43 System.out.println("Output directory set to " + this.outputDir); 32 44 33 // set up the output directory 34 this.outputDir = outputDir != null ? 
outputDir : this.collectionManager.getBuildDirectory(); 35 File outputDirFile = new File(this.outputDir); 36 if (!outputDirFile.exists() || !outputDirFile.isDirectory()) 37 { System.err.println("Unable to open directory " + this.outputDir + " for writing"); 38 System.exit(1); 39 } 40 System.out.println("Output directory set to " + this.outputDir); 45 // TODO: initialise the expanders here... 41 46 42 // initialise any recognisers 43 this.recogniserManager = new RecogniserManager(); 44 RecogniserInterface recogniser = new HTMLRecogniser(docList); 45 recogniserManager.addRecogniser(recogniser); 46 recogniser = new TextRecogniser(docList); 47 recogniserManager.addRecogniser(recogniser); 48 49 // set up the extractors 50 this.extractorManager = new ExtractorManager(this.docList); 47 // initialise any recognisers 48 this.recogniserManager = new RecogniserManager(); 49 RecogniserInterface recogniser = new HTMLRecogniser(docList); 50 recogniserManager.addRecogniser(recogniser); 51 recogniser = new TextRecogniser(docList); 52 recogniserManager.addRecogniser(recogniser); 51 53 52 // set up the enrichers 54 // set up the extractors 55 this.extractorManager = new ExtractorManager(this.docList); 56 57 // TODO: set up the enrichers... 
58 59 // set up the classifiers 60 this.classifierManager = new ClassifierManager(this.docList); 61 62 // set up the indexers 63 this.indexerManager = new IndexerManager(this.docList); 64 IndexerInterface iface = new MGIndexer(); 65 iface.configure(this.outputDir); 66 this.indexerManager.addIndexer(iface); 67 } 68 69 public String getLastBuildSequence() 70 { return this.collectionManager.getCollectionMetadata("gsdl3", "buildsequence"); 71 } 53 72 54 // set up the classifiers 73 public void run() 74 { 75 this.collectionManager.startBuild(); 55 76 56 // set up the indexers 57 this.indexerManager = new IndexerManager(this.docList); 58 IndexerInterface iface = new MGIndexer(); 59 iface.configure(this.outputDir); 60 this.indexerManager.addIndexer(iface); 77 for (int i = 0; i < this.inputRoots.size(); i ++) 78 { FileCrawler fileCrawler = new FileCrawler(new File((String) this.inputRoots.get(i)), recogniserManager); 79 80 fileCrawler.crawl(); 61 81 } 82 this.indexerManager.indexDocuments(); 62 83 63 public String getLastBuildSequence() 64 { return this.collectionManager.getCollectionMetadata("gsdl3", "buildsequence"); 65 } 84 this.docList.writeDocuments(new File(outputDir)); 85 // this.docList.writeSQLDocuments(this.collectionManager.getDatabase()); 86 87 this.collectionManager.setCollectionMetadata("gsdl3", "documentCount", Integer.toString(this.docList.getCount())); 66 88 67 public void run() 68 { 69 this.collectionManager.startBuild(); 70 71 for (int i = 0; i < this.inputRoots.size(); i ++) 72 { FileCrawler fileCrawler = new FileCrawler(new File((String) this.inputRoots.get(i)), recogniserManager); 73 74 fileCrawler.crawl(); 75 } 76 this.indexerManager.indexDocuments(); 77 78 this.docList.writeDocuments(new File(outputDir)); 79 this.docList.writeSQLDocuments(this.collectionManager.getDatabase()); 80 81 this.collectionManager.endBuild(); 82 } 83 84 public void addInputDirectory(String inputDir) 85 { this.inputRoots.add(inputDir); 86 } 89 
this.collectionManager.endBuild(); 90 } 91 92 public void addInputDirectory(String inputDir) 93 { this.inputRoots.add(inputDir); 94 } 87 95 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/CollectionManager.java
r5798 r5943 4 4 import java.util.Calendar; 5 5 import java.util.GregorianCalendar; 6 6 7 import java.io.File; 7 8 import java.io.IOException; … … 50 51 { String collectRoot = System.getProperty("GSDL3HOME"); 51 52 52 this.database = GS3SQLConnectionFactory.createConnection(); 53 this.database.initCollection(collection); 53 this.database = GS3SQLConnectionFactory.createConnection(collection); 54 if (this.database == null) { 55 this.database = GS3SQLConnectionFactory.createConnection("test"); 56 this.database.initCollection(collection); 57 } 58 59 this.metadata = new CollectionMetadata(); 54 60 55 61 if (collectRoot == null) … … 131 137 } 132 138 133 public String getImportDirectory() 134 { return this.collectionHome + "/import"; 135 } 136 137 public String getBuildDirectory() 138 { return this.collectionHome + "/building"; 139 } 140 141 public GS3SQLConnection getDatabase() 142 { 143 return this.database; 144 } 145 146 public void startBuild() 147 { GregorianCalendar today = new GregorianCalendar(); 148 149 if (this.lastBuildDate != null) 150 { // if the build date is different to the last build date, then reset the build 151 // document number 152 if (today.get(Calendar.YEAR) != this.lastBuildDate.get(Calendar.YEAR) || 153 today.get(Calendar.MONTH) != this.lastBuildDate.get(Calendar.MONTH) || 154 today.get(Calendar.DAY_OF_MONTH) != this.lastBuildDate.get(Calendar.DAY_OF_MONTH)) 155 { this.buildDocNo = 1; 156 } 157 } 158 this.lastBuildDate = today; 159 } 160 161 public void endBuild() 162 { 163 } 164 165 public String getNextDocumentID() 166 { StringBuffer ID = new StringBuffer(); 167 168 int value; 169 ID.append(lastBuildDate.get(Calendar.YEAR)); 170 171 // the use of month is a little odd, hence the following 172 // code. Calendar.MONTH yields 0 = January, 1 = February, 173 // etc. hence there is a '+1' added to the month to make 174 // it into January = 1, etc., and the padding is altered 175 // correspondingly. 
176 value = lastBuildDate.get(Calendar.MONTH); 177 if (value < 9) 178 { ID.append("0"); 179 } 180 ID.append(value + 1); 181 value = lastBuildDate.get(Calendar.DAY_OF_MONTH); 182 if (value < 10) 183 ID.append("0"); 184 ID.append(value); 185 186 187 value = this.buildDocNo; 188 this.buildDocNo ++; 189 190 ID.append("."); 191 ID.append(Integer.toString(value)); 192 return ID.toString(); 193 } 194 195 public int getDocumentNumber() 196 { this.buildDocNo ++; 197 return this.buildDocNo - 1; 198 } 199 200 /** 201 * Get the collection metadata item in the given namespace 202 * 203 * @param <code>String</code> the namespace 204 * @param <code>String</code> the label of the metadata 205 */ 206 public String getCollectionMetadata(String namespace, String label) 207 { return this.metadata.getCollectionMetadata(namespace, label).get(0).toString(); 208 } 209 210 /** 211 * Set the collection metadata item in the given namespace 212 * 213 * @param <code>String</code> the namespace 214 * @param <code>String</code> the label 215 * @param <code>String</code> the value 216 */ 217 public void setCollectionMetadata(String namespace, String label, String value) 218 { this.metadata.setCollectionMetadata(namespace, label, value); 219 } 139 public String getImportDirectory() 140 { return this.collectionHome + "/import"; 141 } 142 143 public String getBuildDirectory() 144 { return this.collectionHome + "/building"; 145 } 146 147 public GS3SQLConnection getDatabase() 148 { 149 return this.database; 150 } 151 152 public void startBuild() 153 { GregorianCalendar today = new GregorianCalendar(); 154 155 if (this.lastBuildDate != null) 156 { // if the build date is different to the last build date, then reset the build 157 // document number 158 if (today.get(Calendar.YEAR) != this.lastBuildDate.get(Calendar.YEAR) || 159 today.get(Calendar.MONTH) != this.lastBuildDate.get(Calendar.MONTH) || 160 today.get(Calendar.DAY_OF_MONTH) != this.lastBuildDate.get(Calendar.DAY_OF_MONTH)) 161 { 
this.buildDocNo = 1; 162 } 163 } 164 this.lastBuildDate = today; 165 } 166 167 public void endBuild() 168 { 169 Date startDate = this.lastBuildDate.getTime(); 170 Date date = new Date(); 171 172 long startTime = startDate.getTime(); 173 long endTime = date.getTime(); 174 175 long difference = ((endTime - startTime) + 500) / 1000; 176 177 System.out.println("Build completed"); 178 System.out.println("---------------"); 179 System.out.println("Total Documents: " + this.getCollectionMetadata("gsdl3", "documentCount")); 180 System.out.println("Total Time : " + (difference / 60) + " min. " + (difference % 60) + " secs."); 181 } 182 183 public String getNextDocumentID() 184 { StringBuffer ID = new StringBuffer(); 185 186 int value; 187 ID.append(lastBuildDate.get(Calendar.YEAR)); 188 189 // the use of month is a little odd, hence the following 190 // code. Calendar.MONTH yields 0 = January, 1 = February, 191 // etc. hence there is a '+1' added to the month to make 192 // it into January = 1, etc., and the padding is altered 193 // correspondingly. 
194 value = lastBuildDate.get(Calendar.MONTH); 195 if (value < 9) 196 { ID.append("0"); 197 } 198 ID.append(value + 1); 199 value = lastBuildDate.get(Calendar.DAY_OF_MONTH); 200 if (value < 10) 201 ID.append("0"); 202 ID.append(value); 203 204 205 value = this.buildDocNo; 206 this.buildDocNo ++; 207 208 ID.append("."); 209 ID.append(Integer.toString(value)); 210 return ID.toString(); 211 } 212 213 public int getDocumentNumber() 214 { this.buildDocNo ++; 215 return this.buildDocNo - 1; 216 } 217 218 /** 219 * Get the collection metadata item in the given namespace 220 * 221 * @param <code>String</code> the namespace 222 * @param <code>String</code> the label of the metadata 223 */ 224 public String getCollectionMetadata(String namespace, String label) 225 { return this.metadata.getCollectionMetadata(namespace, label).get(0).toString(); 226 } 227 228 /** 229 * Set the collection metadata item in the given namespace 230 * 231 * @param <code>String</code> the namespace 232 * @param <code>String</code> the label 233 * @param <code>String</code> the value 234 */ 235 public void setCollectionMetadata(String namespace, String label, String value) 236 { this.metadata.setCollectionMetadata(namespace, label, value); 237 } 220 238 } 239 -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/FileCrawler.java
r5798 r5943 6 6 { 7 7 File root; 8 8 FileCrawlObserver observer; 9 9 10 11 12 13 10 public FileCrawler(File file, FileCrawlObserver observer) 11 { this.root = file; 12 this.observer = observer; 13 } 14 14 15 16 17 15 public void crawl() 16 { this.crawlDirectory(this.root); 17 } 18 18 19 20 21 19 public boolean crawlDirectory(File file) 20 { 21 File [] children = file.listFiles(); 22 22 23 24 25 23 if (children == null) { 24 return true; 25 } 26 26 27 28 29 30 31 32 33 34 35 36 37 38 27 for (int c = 0; c < children.length; c ++) { 28 if (children[c].isDirectory()) { 29 crawlDirectory(children[c]); 30 } 31 else 32 { 33 // TODO: process file 34 this.observer.processFile(children[c]); 35 } 36 } 37 return true; 38 } 39 39 }
Note: See TracChangeset for help on using the changeset viewer.