Changeset 6102


Ignore:
Timestamp:
2003-12-03T09:39:47+13:00 (20 years ago)
Author:
cs025
Message:

Added MetaXMLExtractor (at least temporarily), improved other extractors

Location:
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java

    r6013 r6102  
    1717public class GMLExtractor implements ExtractorInterface
    1818{
     19  public static final String ACCUMULATE_MODE = "accumulate";
     20
    1921  /**
    2022   *  An inner class to handle GML files
    2123   */
    2224  class GMLHandler extends DefaultHandler
    23   { String file;
     25  { List   files;
    2426    String label;
    2527    StringBuffer value;
    2628    boolean inElement;
     29    boolean accumulate;
    2730
    2831    GMLHandler()
     
    3538
    3639    public void startElement(String URI, String localName, String qName, Attributes attributes)
    37     { if (localName.equals("Filename"))
     40    { if (localName.equals("FileName"))
    3841      { this.value = new StringBuffer();
     42      }
     43      else if (localName.equals("FileSet"))
     44      { this.files = new ArrayList();
     45      }
     46      else if (localName.equals("Description"))
     47      {
    3948      }
    4049      else if (localName.equals("Metadata"))
    4150      { this.label = attributes.getValue("name");
    4251        this.value = new StringBuffer();
     52
     53    String mode = attributes.getValue("mode");
     54    this.accumulate = mode.equals(ACCUMULATE_MODE);
    4355      }
    4456    }
    4557
    4658    public void endElement(String URI, String localName, String qName)
    47     { if (localName.equals("Filename"))
    48       { this.file = this.value.toString();
     59    { if (localName.equals("FileName"))
     60      { String file = this.value.toString();
    4961        this.value = null;
     62    this.files.add(file);
     63      }
     64      else if (localName.equals("FileSet"))
     65      { // post the existing files item...
     66      }
     67      else if (localName.equals("Description"))
     68      {
    5069      }
    5170      else if (localName.equals("Metadata"))
    52       { GMLExtractor.postMetadata(this.file, this.label, this.value.toString());
     71      { GMLExtractor.postMetadata(this.files, this.label, this.value.toString());
    5372        this.value = null;
    5473    this.label = null;
     
    5776
    5877    public void characters(char c[], int start, int length)
    59     { if (this.label != null)
     78    { if (this.value != null)
    6079      { String string = new String(c, start, length);
    6180        this.value.append(string);
     
    123142  }
    124143
    125   protected static void postMetadata(String file, String value, String label)
     144  protected static void postMetadata(List files, String value, String label)
    126145  {
     146   
    127147  }
    128148
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java

    r6013 r6102  
    88import java.util.ArrayList;
    99import java.util.Iterator;
     10import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
    1011
    1112import org.xml.sax.XMLReader;
     
    3334   *  An inner class to handle GML files
    3435   */
    35   class IndexHandler
    36   { String  content;
    37     String  line;
    38     int     pos;
    39     boolean doneRow;
    40     List    labels;
     36  class IndexHandler extends GS2TextFileHandler
     37  { List    labels;
    4138    URL     base;
    4239
    4340    IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
    44     { this.content = content;
    45       this.doneRow = false;
     41    {
     42      super(content);
     43
    4644      this.labels  = new ArrayList();
    4745      this.base    = url;
     
    6967      while (this.hasMore())
    7068      { String label = this.getEntry(true);
    71         if (label == null)
    72       continue;
    73     label.trim();
    74 
    75     if (label.length() == 0) {
     69        if (label == null || label.length() == 0) {
    7670      continue;
    7771    }
     
    8680    // Get the file pattern itself
    8781    String filePattern = this.getEntry(true);
    88     if (filePattern == null) {
     82    if (filePattern == null || filePattern.length() == 0) {
    8983      continue;
    9084    }
    9185
    92     filePattern.trim();
    93     if (filePattern.length() == 0) {
    94       continue;
    95     }
    96 
    9786    // get a list of documents that match the file pattern
    98     List files = documentList.findDocumentIdsUsingFile(filePattern);
    99     if (files != null) {
    100       Iterator iterator = files.iterator();
     87    List documentIds = documentList.findDocumentIdsUsingFile(filePattern);
     88    if (documentIds != null) {
     89      Iterator iterator = documentIds.iterator();
    10190      while (iterator.hasNext()) {
    10291        System.out.println("Matches file " + iterator.next().toString());
     
    10695    // if no files match this data, then skip this row
    10796    // TODO: raise a quality error message
    108     if (files == null || files.size() == 0) {
     97    if (documentIds == null || documentIds.size() == 0) {
    10998      continue;
    11099    }
    111100
    112     // TODO: cache up the documents that match for speed?
     101    // cache up the documents that match for speed improvements...
     102    List documents = new ArrayList();
     103    Iterator idIterator = documentIds.iterator();
     104    while (idIterator.hasNext()) {
     105      String docIdString = idIterator.next().toString();
     106      System.out.println(docIdString);
     107      DocumentID docId   = new DocumentID(docIdString);
     108      DocumentInterface document = documentList.getDocument(docId);
     109      if (document != null) {
     110        documents.add(document);
     111      }
     112    }
    113113
    114114    // Next, split the row into the separate metadata items
     
    116116    while (this.hasMore()) {
    117117      String item = this.getEntry(true);
    118       if (item == null) {
    119         entryNo ++;
    120         continue;
    121       }
    122 
    123       item.trim();
    124       if (item.length() == 0) {
     118      if (item == null || item.length() == 0) {
    125119        entryNo ++;
    126120        continue;
     
    154148      // in order to minimise rewrites...
    155149      if (label != null) {
    156         System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
    157         // Post to document
     150        Iterator docIterator = documents.iterator();
     151        while (docIterator.hasNext()) {
     152          DocumentInterface document = (DocumentInterface) docIterator.next();
     153
     154          // Post to document
     155          // TODO: tailor this to posting documents to *sections* as required...
     156          document.addDocumentMetadata(label, item);
     157          System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
     158        }
    158159      }
    159160      entryNo ++;
    160161    }
     162   
     163    // write out the modified documents
     164    // TODO: nicer/more generalised interface for this and related activity in
     165    //       extractor manager (actually, enricher manager);
     166    Iterator docIterator = documents.iterator();
     167    while (docIterator.hasNext()) {
     168      DocumentInterface document = (DocumentInterface) docIterator.next();
     169
     170      documentList.modifiedDocument(document);
     171    }
    161172      }
    162173    }
    163174
    164     private boolean hasMore()
    165     { return this.line != null;
    166     }
    167 
    168     private boolean hasMoreLines()
    169     { return this.content != null;
    170     }
    171 
    172     private String getEntry()
    173     { return this.getEntry(false);
    174     }
    175 
    176     private String getEntry(boolean breakSpace)
    177     { String reply;
    178       int start, tab = 0;
    179       boolean quoted = false;
    180 
    181       start = 0;
    182       while (start < this.line.length() &&
    183          this.line.charAt(start) == ' ') {
    184     start ++;
    185       }
    186 
    187       if (start == this.line.length()) {
    188     this.line = null;
    189     return null;
    190       }
    191 
    192       if (this.line.charAt(start) == '"') {
    193     quoted = true;
    194     breakSpace = false;
    195     start ++;
    196       }
    197       tab = start;
    198 
    199       while (tab != this.line.length() &&
    200          this.line.charAt(tab) != '\t' &&
    201          !(quoted && this.line.charAt(tab) == '"') &&
    202              !(this.line.charAt(tab) == ' ' && breakSpace))
    203       { tab ++;
    204       }
    205 
    206       if (start > 0) {
    207     this.line = this.line.substring(start);
    208     tab -= start;
    209       }
    210 
    211       if (tab == this.line.length()) {
    212     reply = this.line;
    213     this.line = null;
    214       }
    215       else {
    216     reply = this.line.substring(0, tab);
    217     this.line = this.line.substring(tab+1);
    218       }
    219 
    220       return reply;
    221     }
    222 
    223     private String getLine()
    224     { if (this.content == null) {
    225     this.line = null;
    226     return null;
    227       }
    228    
    229       do {
    230         int eol = this.content.indexOf('\n');
    231     if (eol < 0) {
    232       this.line = this.content;
    233       this.content = null;
    234     }
    235     else {
    236       this.line = this.content.substring(0, eol);
    237       this.content = this.content.substring(eol+1);
    238       while (this.content.length() > 0 &&
    239          this.content.charAt(0) < ' ')
    240       { this.content = this.content.substring(1);
    241       }
    242     }
    243 
    244     if (this.line != null) {
    245       this.line.trim();
    246     }
    247       } while (this.content != null && this.line != null && this.line.length() == 0);
    248 
    249       return this.line;
    250     }
    251175  }
    252176
Note: See TracChangeset for help on using the changeset viewer.