Changeset 12975
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java
r12846 r12975 20 20 import org.apache.lucene.index.IndexReader; 21 21 import org.apache.lucene.index.Term; 22 import org.apache.lucene.index.Term FreqVector;22 import org.apache.lucene.index.TermDocs; 23 23 import org.apache.lucene.queryParser.ParseException; 24 24 import org.apache.lucene.queryParser.QueryParser; … … 57 57 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { })); 58 58 59 Sort sorter = n ew Sort();59 Sort sorter = null; 60 60 Filter filter = null; 61 61 String fuzziness = null; … … 75 75 if (args[i].equals("-filter")) { 76 76 i++; 77 78 // Parse up filter79 77 filter = parseFilterString(args[i]); 80 78 } … … 127 125 query = query.rewrite(reader); 128 126 129 // Perform the query 130 Hits hits; 131 if (filter != null) { 132 hits = searcher.search(query, filter, sorter); 133 } 134 else { 135 hits = searcher.search(query, sorter); 136 } 137 138 // Return the list of expanded query terms and their frequencies 139 HashMap term_counts = new HashMap(); 140 HashMap term_fields = new HashMap(); 141 HashSet terms = new HashSet(); 142 query.extractTerms(terms); 143 Iterator iter = terms.iterator(); 144 while (iter.hasNext()) { 145 Term term = (Term) iter.next(); 146 // If you wanted to limit this to just text terms add 147 // something like this: 148 //if (term.field().equals(TEXTFIELD)) 149 term_counts.put(term.text(), new Integer(0)); 150 term_fields.put(term.text(), term.field()); 151 } 127 // Perform the query (filter and sorter may be null) 128 Hits hits = searcher.search(query, filter, sorter); 152 129 153 130 // Do we need to use a hit iterator to get sorted results? 
… … 173 150 } 174 151 // And skip all the rest 175 176 // From the document, extract the Term Vector for the 177 // text field 178 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), TEXTFIELD); 179 if (term_freq_vector != null && term_freq_vector.size() > 0) { 180 int[] term_frequencies = term_freq_vector.getTermFrequencies(); 181 // Now for each query term, determine the 182 // frequency - which may of course be 0. 183 Set term_counts_set = term_counts.keySet(); 184 Iterator terms_iter = term_counts_set.iterator(); 185 while (terms_iter.hasNext()) { 186 187 String term = (String) terms_iter.next(); 188 Integer count_integer = (Integer) term_counts.get(term); 189 int count = count_integer.intValue(); 190 int index = term_freq_vector.indexOf(term); 191 // If the term has a count, then add to 192 // the total count for this term 193 if (index != -1) { 194 count += term_frequencies[index]; 195 } 196 // Store the result 197 term_counts.put(term, new Integer(count)); 198 count_integer = null; 199 term = null; 200 } 201 terms_iter = null; 202 term_counts_set = null; 152 153 ++counter; 154 } 155 156 // Return the list of expanded query terms and their frequencies 157 HashSet terms = new HashSet(); 158 query.extractTerms(terms); 159 Iterator term_iterator = terms.iterator(); 160 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>"); 161 while (term_iterator.hasNext()) { 162 Term term = (Term) term_iterator.next(); 163 164 // Get the term frequency over all the documents 165 TermDocs term_docs = reader.termDocs(term); 166 int term_freq = term_docs.freq(); 167 while (term_docs.next()) { 168 term_freq += term_docs.freq(); 203 169 } 204 else { 205 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 206 } 207 ++counter; 170 171 // If you wanted to limit this to just text terms add 172 // something like this: 173 // if (term.field().equals(TEXTFIELD)) 174 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />"); 208 175 } 209 176 210 // Retrieve all the useful terms211 Set term_counts_set = term_counts.keySet();212 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");213 // Iterate over them214 Iterator terms_iter = term_counts_set.iterator();215 while (terms_iter.hasNext()) {216 String term = (String) terms_iter.next();217 Integer count = (Integer) term_counts.get(term);218 String field = (String) term_fields.get(term);219 220 // Ignore any terms with zero frequency, because they don't exist in the matching221 // documents. It seems that this should never happen, but it's a consequence of222 // how the terms are identified. The terms are found by rewriting the query (above).223 // At this point, the query hasn't been run, so each query term is expanded without224 // knowing whether the expanded term will actually appear in one of the resulting225 // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if226 // the search is for "otago AND auckland", no matching documents may include "otaio".227 // Hopefully that made some sense...228 if (count.intValue() > 0) {229 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");230 }231 count = null;232 term = null;233 }234 235 // Cleanup236 terms_iter = null;237 term_counts_set = null;238 239 177 // Return the list of stop words removed from the query 240 178 HashSet terms_including_stop_words = new HashSet(); -
trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java
r12846 r12975 20 20 import org.apache.lucene.index.IndexReader; 21 21 import org.apache.lucene.index.Term; 22 import org.apache.lucene.index.Term FreqVector;22 import org.apache.lucene.index.TermDocs; 23 23 import org.apache.lucene.queryParser.ParseException; 24 24 import org.apache.lucene.queryParser.QueryParser; … … 57 57 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { })); 58 58 59 Sort sorter = n ew Sort();59 Sort sorter = null; 60 60 Filter filter = null; 61 61 String fuzziness = null; … … 75 75 if (args[i].equals("-filter")) { 76 76 i++; 77 78 // Parse up filter79 77 filter = parseFilterString(args[i]); 80 78 } … … 127 125 query = query.rewrite(reader); 128 126 129 // Perform the query 130 Hits hits; 131 if (filter != null) { 132 hits = searcher.search(query, filter, sorter); 133 } 134 else { 135 hits = searcher.search(query, sorter); 136 } 137 138 // Return the list of expanded query terms and their frequencies 139 HashMap term_counts = new HashMap(); 140 HashMap term_fields = new HashMap(); 141 HashSet terms = new HashSet(); 142 query.extractTerms(terms); 143 Iterator iter = terms.iterator(); 144 while (iter.hasNext()) { 145 Term term = (Term) iter.next(); 146 // If you wanted to limit this to just text terms add 147 // something like this: 148 //if (term.field().equals(TEXTFIELD)) 149 term_counts.put(term.text(), new Integer(0)); 150 term_fields.put(term.text(), term.field()); 151 } 127 // Perform the query (filter and sorter may be null) 128 Hits hits = searcher.search(query, filter, sorter); 152 129 153 130 // Do we need to use a hit iterator to get sorted results? 
… … 173 150 } 174 151 // And skip all the rest 175 176 // From the document, extract the Term Vector for the 177 // text field 178 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), TEXTFIELD); 179 if (term_freq_vector != null && term_freq_vector.size() > 0) { 180 int[] term_frequencies = term_freq_vector.getTermFrequencies(); 181 // Now for each query term, determine the 182 // frequency - which may of course be 0. 183 Set term_counts_set = term_counts.keySet(); 184 Iterator terms_iter = term_counts_set.iterator(); 185 while (terms_iter.hasNext()) { 186 187 String term = (String) terms_iter.next(); 188 Integer count_integer = (Integer) term_counts.get(term); 189 int count = count_integer.intValue(); 190 int index = term_freq_vector.indexOf(term); 191 // If the term has a count, then add to 192 // the total count for this term 193 if (index != -1) { 194 count += term_frequencies[index]; 195 } 196 // Store the result 197 term_counts.put(term, new Integer(count)); 198 count_integer = null; 199 term = null; 200 } 201 terms_iter = null; 202 term_counts_set = null; 152 153 ++counter; 154 } 155 156 // Return the list of expanded query terms and their frequencies 157 HashSet terms = new HashSet(); 158 query.extractTerms(terms); 159 Iterator term_iterator = terms.iterator(); 160 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>"); 161 while (term_iterator.hasNext()) { 162 Term term = (Term) term_iterator.next(); 163 164 // Get the term frequency over all the documents 165 TermDocs term_docs = reader.termDocs(term); 166 int term_freq = term_docs.freq(); 167 while (term_docs.next()) { 168 term_freq += term_docs.freq(); 203 169 } 204 else { 205 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 206 } 207 ++counter; 170 171 // If you wanted to limit this to just text terms add 172 // something like this: 173 // if (term.field().equals(TEXTFIELD)) 174 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />"); 208 175 } 209 176 210 // Retrieve all the useful terms211 Set term_counts_set = term_counts.keySet();212 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");213 // Iterate over them214 Iterator terms_iter = term_counts_set.iterator();215 while (terms_iter.hasNext()) {216 String term = (String) terms_iter.next();217 Integer count = (Integer) term_counts.get(term);218 String field = (String) term_fields.get(term);219 220 // Ignore any terms with zero frequency, because they don't exist in the matching221 // documents. It seems that this should never happen, but it's a consequence of222 // how the terms are identified. The terms are found by rewriting the query (above).223 // At this point, the query hasn't been run, so each query term is expanded without224 // knowing whether the expanded term will actually appear in one of the resulting225 // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if226 // the search is for "otago AND auckland", no matching documents may include "otaio".227 // Hopefully that made some sense...228 if (count.intValue() > 0) {229 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");230 }231 count = null;232 term = null;233 }234 235 // Cleanup236 terms_iter = null;237 term_counts_set = null;238 239 177 // Return the list of stop words removed from the query 240 178 HashSet terms_including_stop_words = new HashSet();
Note: See TracChangeset for help on using the changeset viewer.