greenstone.org greenstone wiki greenstone trac planet greenstone

Changeset 17804

Show
Ignore:
Timestamp:
2008-11-10 20:46:44 (2 months ago)
Author:
davidb
Message:

Introduction of GS2Analyzer, which overrides the default behaviour of StandardAnalyzer to turn accent folding of Latin-1 *on*

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java

    r16583 r17804  
    4444import org.apache.lucene.index.IndexWriter; 
    4545import org.apache.lucene.index.Term; 
    46 import org.apache.lucene.analysis.standard.StandardAnalyzer; 
     46import org.apache.lucene.analysis.Analyzer; 
    4747 
    4848import java.util.Stack; 
     
    6161    public static void main (String args[]) throws Exception  
    6262    {  
    63  
    6463        int verbosity = 1; 
    6564        // Default is to edit the existing index 
     
    150149    { 
    151150        IndexWriter writer_   = null; 
     151        Analyzer analyzer_    = null; 
    152152        SAXParser sax_parser_ = null; 
    153153        String doc_tag_level_ = null; 
     
    164164        protected String file_id_ = null; 
    165165 
     166        static private String[] stop_words = GS2Analyzer.STOP_WORDS; 
     167 
    166168        /** pass in true if want to create a new index, false if want to use the existing one */ 
    167169        public Indexer (String doc_tag_level, File index_dir, boolean create)  
     
    177179                reader.setFeature("http://xml.org/sax/features/validation", false); 
    178180 
    179                 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create); 
     181                analyzer_ = new GS2Analyzer(stop_words); 
     182 
     183                writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create); 
    180184                // by default, will only index 10,000 words per document 
    181185                // Can throw out_of_memory errors 
     
    267271                String node_id = atts.getValue("gs2:id"); 
    268272                print(" " + qName + ": " + node_id ); 
    269                 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED)); 
     273                current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.TOKENIZED)); 
    270274 
    271275                current_doc_oid_ = atts.getValue("gs2:docOID"); 
    272                 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED)); 
     276                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.TOKENIZED)); 
    273277            } 
    274278 
     
    302306                    if (!qName.equals("TX")) 
    303307                        { 
    304                             current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO)); 
     308                            current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO)); 
    305309                        } 
    306310 
     
    310314            if (qName.equals(doc_tag_level_)) { 
    311315                try { 
    312                     writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_); 
     316                    writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_); 
    313317                }  
    314318                catch (java.io.IOException e) { 
  • indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r16947 r17804  
    5959 
    6060    // Use the standard set of English stop words by default 
    61     static private String[] stop_words = StandardAnalyzer.STOP_WORDS; 
     61    static private String[] stop_words = GS2Analyzer.STOP_WORDS; 
    6262 
    6363    private String full_indexdir=""; 
     
    9494        // Create one query parser with the standard set of stop words, and one with none 
    9595 
    96         query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words)); 
    97         query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { })); 
     96        query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words)); 
     97        query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { })); 
    9898    } 
    9999     
     
    134134            query_including_stop_words = query_including_stop_words.rewrite(reader); 
    135135                 
     136            // System.err.println("********* query_string " + query_string + "****"); 
     137 
    136138            Query query = parseQuery(reader, query_parser, query_string, fuzziness); 
    137139            query = query.rewrite(reader);