Context Navigation

← Previous Change
Next Change →

Changeset 17804 for indexers

Timestamp:

2008-11-10T20:46:44+13:00 (15 years ago)

Author:

davidb

Message:

Introduction of GS2Analyzer, which overrides the default behaviour of StandardAnalyzer to turn accent folding of Latin-1 characters *on*

Location:

indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper

Files:

1 added
2 edited

GS2Analyzer.java (added)
GS2LuceneIndexer.java (modified) (8 diffs)
GS2LuceneQuery.java (modified) (3 diffs)

Legend:

Unmodified
Added
Removed

indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java

-              r16583
+              r17804
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
 import java.util.Stack;
 …
     public static void main (String args[]) throws Exception
+    {
     int verbosity = 1;
     // Default is to edit the existing index
 …
+    {
     IndexWriter writer_   = null;
+    Analyzer analyzer_    = null;
     SAXParser sax_parser_ = null;
     String doc_tag_level_ = null;
 …
     protected String file_id_ = null;
+    static private String[] stop_words = GS2Analyzer.STOP_WORDS;
     /** pass in true if want to create a new index, false if want to use the existing one */
     public Indexer (String doc_tag_level, File index_dir, boolean create)
 …
         reader.setFeature("http://xml.org/sax/features/validation", false);
+        writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
+        analyzer_ = new GS2Analyzer(stop_words);
+        writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
         // by default, will only index 10,000 words per document
         // Can throw out_of_memory errors
 …
         String node_id = atts.getValue("gs2:id");
         print(" " + qName + ": " + node_id );
         current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
+        current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.TOKENIZED));
         current_doc_oid_ = atts.getValue("gs2:docOID");
         current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
+        current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.TOKENIZED));
+        }
 …
             if (!qName.equals("TX"))
+            {
                 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
+                current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
+            }
 …
         if (qName.equals(doc_tag_level_)) {
         try {
             writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
+            writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
+        }
         catch (java.io.IOException e) {

indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

-              r16947
+              r17804
     // Use the standard set of English stop words by default
     static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
+    static private String[] stop_words = GS2Analyzer.STOP_WORDS;
     private String full_indexdir="";
 …
     // Create one query parser with the standard set of stop words, and one with none
     query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
         query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
+    query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
+        query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
+    }
 …
         query_including_stop_words = query_including_stop_words.rewrite(reader);
+        // System.err.println("********* query_string " + query_string + "****");
         Query query = parseQuery(reader, query_parser, query_string, fuzziness);
         query = query.rewrite(reader);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 17804 for indexers

Legend:

indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java

indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

Download in other formats: