Context Navigation

← Previous Changeset
Next Changeset →

Changeset 6013

Timestamp:

2003-11-26T15:36:27+13:00 (20 years ago)

Author:

cs025

Message:

Improved extractors

Location:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor

Files:

4 edited

ExtractorInterface.java (modified) (1 diff)
ExtractorManager.java (modified) (1 diff)
GMLExtractor.java (modified) (2 diffs)
IndexExtractor.java (modified) (10 diffs)

Legend:

Unmodified
Added
Removed

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorInterface.java

-              r5800
+              r6013
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
+import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
 public interface ExtractorInterface
+{
+    public void configure(String outputDir);
+    public void startPass(int passNumber);
+    public void extractDocument(DocumentID documentID, DocumentInterface document);
+    public void endPass(int passNumber);
+    public int getNumberOfPasses();
+  public void configure(String outputDir);
+  public void configure(DocumentList list);
+  public void startPass(int passNumber);
+  public void extractDocument(DocumentID documentID, DocumentInterface document);
+  public void endPass(int passNumber);
+  public int getNumberOfPasses();
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorManager.java

r5946	r6013
25	25	this.list[this.used] = extractor;
26	26	this.used ++;
	27
	28	extractor.configure(this.documents);
27	29	}
28	30

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java

-              r5946
+              r6013
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
+import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
 import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
 …
+  }
+  public void configure(DocumentList documentList)
+  { // Intentionally left blank
+  }
   /**
    *  This extractor doesn't need to do any preparation/completion work,

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java

-              r5946
+              r6013
 import java.io.FileReader;
+import java.net.URL;
 import java.util.List;
 import java.util.ArrayList;
+import java.util.Iterator;
 import org.xml.sax.XMLReader;
 …
 import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
+import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
 public class IndexExtractor implements ExtractorInterface
 …
     boolean doneRow;
     List    labels;
-    IndexHandler(String content) throws IndexHandlerException
+    URL     base;
+    IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
     { this.content = content;
       this.doneRow = false;
       this.labels  = new ArrayList();
+      this.base    = url;
+      String parentDir;
+      int leaf = this.base.toString().lastIndexOf('/');
+      if (leaf >= 0) {
+    parentDir = this.base.toString().substring(0, leaf+1);
+      }
+      else {
+    parentDir = this.base.toString();
+      }
       // get the first line
 …
+      }
+      // get the first totem - it should be blank
+      // get the first totem - it should be "key:"
+      String entry = this.getEntry(true);
+      // now get all the labels
+      while (this.hasMore())
+      { String label = this.getEntry(true);
+        if (label == null)
+      continue;
+    label.trim();
+    if (label.length() == 0) {
+      continue;
+    }
+    this.labels.add(label);
+    System.out.println("Adding label: " + label);
+      }
+      while (this.hasMoreLines()) {
+    this.getLine();
+    // Get the file pattern itself
+    String filePattern = this.getEntry(true);
+    if (filePattern == null) {
+      continue;
+    }
+    filePattern.trim();
+    if (filePattern.length() == 0) {
+      continue;
+    }
+    // get a list of documents that match the file pattern
+    List files = documentList.findDocumentIdsUsingFile(filePattern);
+    if (files != null) {
+      Iterator iterator = files.iterator();
+      while (iterator.hasNext()) {
+        System.out.println("Matches file " + iterator.next().toString());
+      }
+    }
+    // if no files match this data, then skip this row
+    // TODO: raise a quality error message
+    if (files == null || files.size() == 0) {
+      continue;
+    }
+    // TODO: cache up the documents that match for speed?
+    // Next, split the row into the separate metadata items
+    int entryNo = 0;
+    while (this.hasMore()) {
+      String item = this.getEntry(true);
+      if (item == null) {
+        entryNo ++;
+        continue;
+      }
+      item.trim();
+      if (item.length() == 0) {
+        entryNo ++;
+        continue;
+      }
+      String label = null;
+      if (item.startsWith("<")) {
+        int labelEnd = item.indexOf('>');
+        if (labelEnd >= 0) {
+          label = item.substring(1, labelEnd);
+          item = item.substring(labelEnd+1, item.length());
+          // eliminate any weird whitespace
+          item.trim();
+          // cope with a solo 'item' label with no following string
+          if (item.length() == 0) {
+        entryNo ++;
+        continue;
+          }
+        }
+        // starts with a bracketed label
+      }
+      else if (entryNo < this.labels.size()) {
+        label = (String) this.labels.get(entryNo);
+      }
+      // Actually post the metadata -
+      // it may be good to have cached all the documents that we're going to change
+      // in order to minimise rewrites...
+      if (label != null) {
+        System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
+        // Post to document
+      }
+      entryNo ++;
+    }
+      }
+    }
 …
     private String getEntry()
+    { int tab = this.line.indexOf('\t');
+      String reply;
+      if (tab < 0) {
+    { return this.getEntry(false);
+    }
+    private String getEntry(boolean breakSpace)
+    { String reply;
+      int start, tab = 0;
+      boolean quoted = false;
+      start = 0;
+      while (start < this.line.length() &&
+         this.line.charAt(start) == ' ') {
+    start ++;
+      }
+      if (start == this.line.length()) {
+    this.line = null;
+    return null;
+      }
+      if (this.line.charAt(start) == '"') {
+    quoted = true;
+    breakSpace = false;
+    start ++;
+      }
+      tab = start;
+      while (tab != this.line.length() &&
+         this.line.charAt(tab) != '\t' &&
+         !(quoted && this.line.charAt(tab) == '"') &&
+             !(this.line.charAt(tab) == ' ' && breakSpace))
+      { tab ++;
+      }
+      if (start > 0) {
+    this.line = this.line.substring(start);
+    tab -= start;
+      }
+      if (tab == this.line.length()) {
     reply = this.line;
     this.line = null;
 …
     private String getLine()
+    { do {
+    { if (this.content == null) {
+    this.line = null;
+    return null;
+      }
+      do {
         int eol = this.content.indexOf('\n');
     if (eol < 0) {
 …
       this.line.trim();
+    }
+      } while (this.line != null && this.line.length() == 0);
+      } while (this.content != null && this.line != null && this.line.length() == 0);
       return this.line;
+    }
+  }
+  private DocumentList documentList;
   /**
 …
   public void configure(String outputDir)
   { // Intentionally left blank
+  }
+  public void configure(DocumentList list)
+  { this.documentList = list;
+  }
 …
       // get the file
+    String documentText = null;
+    //      String documentText =
+    //  DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).toString());
+      String documentText =
+    DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL());
       if (documentText == null) {
 …
       try {
-      IndexHandler handler = new IndexHandler(documentText);
+    IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList);
+      }
       catch (IndexHandlerException ex) {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 6013

Legend:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorInterface.java

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorManager.java

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java

Download in other formats: