Changeset 12377 for trunk/indexers
- Timestamp:
- 2006-08-02T15:07:47+12:00 (18 years ago)
- Files:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java
r12375 r12377 11 11 import java.io.BufferedReader; 12 12 import java.io.InputStreamReader; 13 import java.util.Collections; 14 import java.util.HashMap; 13 15 import java.util.HashSet; 14 16 import java.util.Iterator; 17 import java.util.Set; 15 18 16 19 import org.apache.lucene.analysis.Analyzer; … … 19 22 import org.apache.lucene.index.IndexReader; 20 23 import org.apache.lucene.index.Term; 24 import org.apache.lucene.index.TermFreqVector; 21 25 import org.apache.lucene.queryParser.QueryParser; 26 import org.apache.lucene.search.Hit; 22 27 import org.apache.lucene.search.Hits; 23 28 import org.apache.lucene.search.IndexSearcher; … … 41 46 42 47 Sort sorter = new Sort(); 48 boolean fuzzy = false; 43 49 44 50 // New code to allow the default conjunction operator to be … … 50 56 { 51 57 i++; 58 ///ystem.err.println("**** sort by = " + args[i]); 52 59 sorter = new Sort(args[i]); 53 60 } … … 57 64 default_conjuction_operator = args[i]; 58 65 } 66 if (args[i].equals("-fuzzy")) 67 { 68 fuzzy = true; 69 } 59 70 } 60 71 … … 76 87 break; 77 88 } 78 System.err.println("**** query = " + query_string); 79 80 // Parse the query and rewrite it into individual terms (eg. for wildcard searches) 89 ///ystem.err.println("**** query = " + query_string); 90 81 91 Query query = query_parser.parse(query_string); 82 92 query = query.rewrite(reader); … … 84 94 query_including_stop_words = query_including_stop_words.rewrite(reader); 85 95 96 // If this is a fuzzy search, then we need to add the fuzzy 97 // flag to each of the query terms 98 if (fuzzy && query.toString().length() > 0) 99 { 100 // Revert the query to a string 101 ///ystem.err.println("Rewritten query: " + query.toString()); 102 // Search through the string for TX:<term> query terms 103 // and append the ~ operator. Not that this search will 104 // not change phrase searches (TX:"<term> <term>") as 105 // fuzzy searching is not possible for these entries. 106 // Yahoo! Time for a state machine! 
107 StringBuffer mutable_query_string = new StringBuffer(query.toString()); 108 int o = 0; // Offset 109 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX: 110 int s = 0; // State 111 while(o < mutable_query_string.length()) 112 { 113 char c = mutable_query_string.charAt(o); 114 ///ystem.err.println("SM: in state " + s + " and reading a " + c); 115 if (s == 0 && c == 'T') 116 { 117 ///ystem.err.println("Found T!"); 118 s = 1; 119 } 120 else if (s == 1) 121 { 122 if (c == 'X') 123 { 124 ///ystem.err.println("Found X!"); 125 s = 2; 126 } 127 else 128 { 129 s = 0; // Reset 130 } 131 } 132 else if (s == 2) 133 { 134 if (c == ':') 135 { 136 ///ystem.err.println("Found TX:!"); 137 s = 3; 138 } 139 else 140 { 141 s = 0; // Reset 142 } 143 } 144 else if (s == 3) 145 { 146 // Don't process phrases 147 if (c == '"') 148 { 149 ///ystem.err.println("Stupid phrase..."); 150 s = 0; // Reset 151 } 152 // Found the end of the term... add the 153 // fuzzy search indicator 154 // Nor outside the scope of parentheses 155 if (Character.isWhitespace(c) || c == ')') 156 { 157 ///ystem.err.println("Yahoo! Found fuzzy term."); 158 mutable_query_string.insert(o, '~'); 159 o++; 160 s = 0; // Reset 161 } 162 } 163 o++; 164 } 165 // If we were in the state of looking for the end of a 166 // term - then we just found it! 
167 if (s == 3) 168 { 169 mutable_query_string.append('~'); 170 } 171 // Reparse the query 172 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString()); 173 query = query_parser.parse(mutable_query_string.toString()); 174 // And rewrite again 175 query = query.rewrite(reader); 176 ///ystem.err.println("Rewritten Fuzzy query: " + query.toString()); 177 } 178 179 86 180 // Perform the query 87 181 Hits hits = searcher.search(query, sorter); 88 182 System.out.println("<ResultSet>"); 89 183 System.out.println(" <QueryString>" + query_string + "</QueryString>"); 90 91 184 // Return the list of expanded query terms and their frequencies 185 HashMap term_counts = new HashMap(); 186 HashMap term_fields = new HashMap(); 92 187 HashSet terms = new HashSet(); 93 188 query.extractTerms(terms); 94 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>"); 95 Iterator terms_iter = terms.iterator(); 96 while (terms_iter.hasNext()) { 97 Term term = (Term) terms_iter.next(); 98 System.out.println(" <Term value=\"" + term.text() + "\" freq=\"" + reader.docFreq(term) + "\" field=\"" + term.field() + "\"/>"); 99 } 189 //System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>"); 190 Iterator iter = terms.iterator(); 191 while (iter.hasNext()) 192 { 193 Term term = (Term) iter.next(); 194 // If you wanted to limit this to just TX terms add 195 // something like this: 196 //if (term.field().equals("TX")) 197 term_counts.put(term.text(), new Integer(0)); 198 term_fields.put(term.text(), term.field()); 199 } 200 201 // Do we need to use a hit iterator to get sorted results? 
202 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>"); 203 Iterator hit_iter = hits.iterator(); 204 while (hit_iter.hasNext()) 205 { 206 Hit hit = (Hit) hit_iter.next(); 207 Document doc = hit.getDocument(); 208 String node_id = doc.get("nodeID"); 209 System.out.println(" <Match id=\"" + node_id + "\" />"); 210 211 // From the document, extract the Term Vector for the 212 // TX field 213 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX"); 214 if (term_freq_vector.size() > 0) 215 { 216 int[] term_frequencies = term_freq_vector.getTermFrequencies(); 217 // Now for each query term, determine the 218 // frequency - which may of course be 0. 219 Set term_counts_set = term_counts.keySet(); 220 Iterator terms_iter = term_counts_set.iterator(); 221 while (terms_iter.hasNext()) 222 { 223 String term = (String) terms_iter.next(); 224 Integer count_integer = (Integer) term_counts.get(term); 225 int count = count_integer.intValue(); 226 int index = term_freq_vector.indexOf(term); 227 // If the term has a count, then add to 228 // the total count for this term 229 if (index != -1) 230 { 231 count += term_frequencies[index]; 232 233 } 234 // Store the result 235 term_counts.put(term, new Integer(count)); 236 count_integer = null; 237 term = null; 238 } 239 terms_iter = null; 240 term_counts_set = null; 241 } 242 else 243 { 244 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 245 } 246 } 247 248 // Retrieve all the useful terms 249 Set term_counts_set = term_counts.keySet(); 250 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>"); 251 // Iterate over them 252 Iterator terms_iter = term_counts_set.iterator(); 253 while (terms_iter.hasNext()) 254 { 255 String term = (String) terms_iter.next(); 256 Integer count = (Integer) term_counts.get(term); 257 String field = (String) term_fields.get(term); 258 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />"); 259 count = null; 260 term = null; 261 } 262 // Cleanup 263 terms_iter = null; 264 term_counts_set = null; 100 265 101 266 // Return the list of stop words removed from the query 102 267 HashSet terms_including_stop_words = new HashSet(); 103 268 query_including_stop_words.extractTerms(terms_including_stop_words); 104 System.out.println(" <StopWordsInfo num=\"" + (terms_including_stop_words.size() - terms.size()) + "\"/>");105 269 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator(); 106 270 while (terms_including_stop_words_iter.hasNext()) { … … 111 275 } 112 276 113 // Return the matching documents 114 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>"); 115 for (int i = 0; i < hits.length(); i++) { 116 Document doc = hits.doc(i); 117 String node_id = doc.get("nodeID"); 118 System.out.println(" <Match id=\"" + node_id + "\"/>"); 119 } 120 121 System.out.println("</ResultSet>"); 277 System.out.println("</ResultSet>"); 122 278 } 123 279
Note: See TracChangeset for help on using the changeset viewer.