Changeset 17804


Ignore:
Timestamp:
2008-11-10T20:46:44+13:00 (16 years ago)
Author:
davidb
Message:

Introduction of GS2Analyzer, which overrides default behaviour of StandardAnalyzer to make accent folding of Latin-1 *on*

Location:
indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java

    r16583 r17804
    44 44    import org.apache.lucene.index.IndexWriter;
    45 45    import org.apache.lucene.index.Term;
    46     - import org.apache.lucene.analysis.standard.StandardAnalyzer;
       46  + import org.apache.lucene.analysis.Analyzer;
    47 47
    48 48    import java.util.Stack;
     
    61 61    public static void main (String args[]) throws Exception
    62 62    {
    63     -
    64 63    int verbosity = 1;
    65 64    // Default is to edit the existing index
     
    150 149    {
    151 150    IndexWriter writer_   = null;
        151  + Analyzer analyzer_    = null;
    152 152    SAXParser sax_parser_ = null;
    153 153    String doc_tag_level_ = null;
     
    164 164    protected String file_id_ = null;
    165 165
        166  + static private String[] stop_words = GS2Analyzer.STOP_WORDS;
        167  +
    166 168    /** pass in true if want to create a new index, false if want to use the existing one */
    167 169    public Indexer (String doc_tag_level, File index_dir, boolean create)
     
    177 179        reader.setFeature("http://xml.org/sax/features/validation", false);
    178 180
    179      -    writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
        181  +    analyzer_ = new GS2Analyzer(stop_words);
        182  +
        183  +    writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
    180 184        // by default, will only index 10,000 words per document
    181 185        // Can throw out_of_memory errors
     
    267 271        String node_id = atts.getValue("gs2:id");
    268 272        print(" " + qName + ": " + node_id );
    269      -    current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
        273  +    current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.TOKENIZED));
    270 274
    271 275        current_doc_oid_ = atts.getValue("gs2:docOID");
    272      -    current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
        276  +    current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.TOKENIZED));
    273 277        }
    274 278
    274278
     
    302 306            if (!qName.equals("TX"))
    303 307            {
    304      -            current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
        308  +            current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    305 309            }
    306 310
     
    310 314        if (qName.equals(doc_tag_level_)) {
    311 315        try {
    312      -        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
        316  +        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
    313 317        }
    314 318        catch (java.io.IOException e) {
  • indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r16947 r17804
    59 59
    60 60    // Use the standard set of English stop words by default
    61     - static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
       61  + static private String[] stop_words = GS2Analyzer.STOP_WORDS;
    62 62
    63 63    private String full_indexdir="";
     
    94 94    // Create one query parser with the standard set of stop words, and one with none
    95 95
    96     - query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
    97     -     query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
       96  + query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
       97  +     query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
    98 98    }
    99 99
     
    134 134        query_including_stop_words = query_including_stop_words.rewrite(reader);
    135 135
        136  +    // System.err.println("********* query_string " + query_string + "****");
        137  +
    136 138        Query query = parseQuery(reader, query_parser, query_string, fuzziness);
    137 139        query = query.rewrite(reader);
Note: See TracChangeset for help on using the changeset viewer.