Changeset 12418
- Timestamp:
- 2006-08-09T10:41:39+12:00 (18 years ago)
- Location:
- trunk
- Files:
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java
r12415 r12418 11 11 import java.io.BufferedReader; 12 12 import java.io.InputStreamReader; 13 import java.io.IOException; 13 14 import java.util.Collections; 14 15 import java.util.HashMap; … … 23 24 import org.apache.lucene.index.Term; 24 25 import org.apache.lucene.index.TermFreqVector; 26 import org.apache.lucene.queryParser.ParseException; 25 27 import org.apache.lucene.queryParser.QueryParser; 28 import org.apache.lucene.search.BooleanQuery.TooManyClauses; 26 29 import org.apache.lucene.search.Filter; 27 30 import org.apache.lucene.search.Hit; … … 39 42 { 40 43 if (args.length == 0) { 41 System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");44 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzzy] [-filter filter_string] [-sort sort_field] [-dco AND|OR]"); 42 45 return; 43 46 } … … 59 62 { 60 63 i++; 61 ///ystem.err.println("**** sort by = " + args[i]);62 64 sorter = new Sort(args[i]); 63 65 } … … 96 98 break; 97 99 } 98 ///ystem.err.println("**** query = " + query_string);99 100 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);101 query_including_stop_words = query_including_stop_words.rewrite(reader);102 103 Query query = parseQuery(reader, query_parser, query_string, fuzzy);104 query = query.rewrite(reader);105 106 // Perform the query107 Hits hits;108 if (filter != null) {109 hits = searcher.search(query, filter, sorter);110 }111 else {112 hits = searcher.search(query, sorter);113 }114 100 System.out.println("<ResultSet>"); 115 101 System.out.println(" <QueryString>" + query_string + "</QueryString>"); 116 102 117 // Return the list of expanded query terms and their frequencies 118 HashMap term_counts = new HashMap(); 119 HashMap term_fields = new HashMap(); 120 HashSet terms = new HashSet(); 121 query.extractTerms(terms); 122 Iterator iter = terms.iterator(); 123 while (iter.hasNext()) 124 { 125 Term term = (Term) iter.next(); 126 // If you wanted to limit this to just TX terms add 127 
// something like this: 128 //if (term.field().equals("TX")) 129 term_counts.put(term.text(), new Integer(0)); 130 term_fields.put(term.text(), term.field()); 131 } 132 133 // Do we need to use a hit iterator to get sorted results? 134 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>"); 135 Iterator hit_iter = hits.iterator(); 136 while (hit_iter.hasNext()) 137 { 138 Hit hit = (Hit) hit_iter.next(); 139 Document doc = hit.getDocument(); 140 String node_id = doc.get("nodeID"); 141 System.out.println(" <Match id=\"" + node_id + "\" />"); 103 try { 104 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string); 105 query_including_stop_words = query_including_stop_words.rewrite(reader); 106 107 Query query = parseQuery(reader, query_parser, query_string, fuzzy); 108 query = query.rewrite(reader); 109 110 // Perform the query 111 Hits hits; 112 if (filter != null) { 113 hits = searcher.search(query, filter, sorter); 114 } 115 else { 116 hits = searcher.search(query, sorter); 117 } 118 119 // Return the list of expanded query terms and their frequencies 120 HashMap term_counts = new HashMap(); 121 HashMap term_fields = new HashMap(); 122 HashSet terms = new HashSet(); 123 query.extractTerms(terms); 124 Iterator iter = terms.iterator(); 125 while (iter.hasNext()) 126 { 127 Term term = (Term) iter.next(); 128 // If you wanted to limit this to just TX terms add 129 // something like this: 130 //if (term.field().equals("TX")) 131 term_counts.put(term.text(), new Integer(0)); 132 term_fields.put(term.text(), term.field()); 133 } 134 135 // Do we need to use a hit iterator to get sorted results? 
136 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>"); 137 Iterator hit_iter = hits.iterator(); 138 while (hit_iter.hasNext()) 139 { 140 Hit hit = (Hit) hit_iter.next(); 141 Document doc = hit.getDocument(); 142 String node_id = doc.get("nodeID"); 143 System.out.println(" <Match id=\"" + node_id + "\" />"); 142 144 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 145 // From the document, extract the Term Vector for the 146 // TX field 147 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX"); 148 if (term_freq_vector != null && term_freq_vector.size() > 0) 149 { 150 int[] term_frequencies = term_freq_vector.getTermFrequencies(); 151 // Now for each query term, determine the 152 // frequency - which may of course be 0. 153 Set term_counts_set = term_counts.keySet(); 154 Iterator terms_iter = term_counts_set.iterator(); 155 while (terms_iter.hasNext()) 156 { 157 String term = (String) terms_iter.next(); 158 Integer count_integer = (Integer) term_counts.get(term); 159 int count = count_integer.intValue(); 160 int index = term_freq_vector.indexOf(term); 161 // If the term has a count, then add to 162 // the total count for this term 163 if (index != -1) 164 { 165 count += term_frequencies[index]; 164 166 165 } 166 // Store the result 167 term_counts.put(term, new Integer(count)); 168 count_integer = null; 169 term = null; 170 } 171 terms_iter = null; 172 term_counts_set = null; 173 } 174 else 175 { 176 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 177 } 178 } 179 180 // Retrieve all the useful terms 181 Set term_counts_set = term_counts.keySet(); 182 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>"); 183 // Iterate over them 184 Iterator terms_iter = term_counts_set.iterator(); 185 while (terms_iter.hasNext()) 186 { 187 String term = (String) terms_iter.next(); 188 Integer count = (Integer) term_counts.get(term); 189 String field = (String) term_fields.get(term); 190 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />"); 191 count = null; 192 term = null; 193 } 194 // Cleanup 195 terms_iter = null; 196 term_counts_set = null; 197 198 // Return the list of stop words removed from the query 199 HashSet terms_including_stop_words = new HashSet(); 200 query_including_stop_words.extractTerms(terms_including_stop_words); 201 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator(); 202 while (terms_including_stop_words_iter.hasNext()) { 203 Term term = (Term) terms_including_stop_words_iter.next(); 204 if (!terms.contains(term)) { 205 System.out.println(" <StopWord value=\"" + term.text() + "\"/>"); 167 } 168 // Store the result 169 term_counts.put(term, new Integer(count)); 170 count_integer = null; 171 term = null; 172 } 173 terms_iter = null; 174 term_counts_set = null; 175 } 176 else 177 { 178 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 179 } 180 } 181 182 // Retrieve all the useful terms 183 Set term_counts_set = term_counts.keySet(); 184 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>"); 185 // Iterate over them 186 Iterator terms_iter = term_counts_set.iterator(); 187 while (terms_iter.hasNext()) 188 { 189 String term = (String) terms_iter.next(); 190 Integer count = (Integer) term_counts.get(term); 191 String field = (String) term_fields.get(term); 192 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />"); 193 count = null; 194 term = null; 195 } 196 // Cleanup 197 terms_iter = null; 198 term_counts_set = null; 199 200 // Return the list of stop words removed from the query 201 HashSet terms_including_stop_words = new HashSet(); 202 query_including_stop_words.extractTerms(terms_including_stop_words); 203 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator(); 204 while (terms_including_stop_words_iter.hasNext()) { 205 Term term = (Term) terms_including_stop_words_iter.next(); 206 if (!terms.contains(term)) { 207 System.out.println(" <StopWord value=\"" + term.text() + "\"/>"); 208 } 206 209 } 207 210 } 208 209 System.out.println("</ResultSet>"); 211 catch (ParseException parse_exception) { 212 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>"); 213 } 214 catch (TooManyClauses too_many_clauses_exception) { 215 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>"); 216 } 217 218 System.out.println("</ResultSet>"); 210 219 } 211 220 212 221 searcher.close(); 213 222 } 214 catch ( Exception exception) {223 catch (IOException exception) { 215 224 exception.printStackTrace(); 216 225 } 217 226 } 218 227 -
trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java
r12415 r12418 11 11 import java.io.BufferedReader; 12 12 import java.io.InputStreamReader; 13 import java.io.IOException; 13 14 import java.util.Collections; 14 15 import java.util.HashMap; … … 23 24 import org.apache.lucene.index.Term; 24 25 import org.apache.lucene.index.TermFreqVector; 26 import org.apache.lucene.queryParser.ParseException; 25 27 import org.apache.lucene.queryParser.QueryParser; 28 import org.apache.lucene.search.BooleanQuery.TooManyClauses; 26 29 import org.apache.lucene.search.Filter; 27 30 import org.apache.lucene.search.Hit; … … 39 42 { 40 43 if (args.length == 0) { 41 System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");44 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzzy] [-filter filter_string] [-sort sort_field] [-dco AND|OR]"); 42 45 return; 43 46 } … … 59 62 { 60 63 i++; 61 ///ystem.err.println("**** sort by = " + args[i]);62 64 sorter = new Sort(args[i]); 63 65 } … … 96 98 break; 97 99 } 98 ///ystem.err.println("**** query = " + query_string);99 100 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);101 query_including_stop_words = query_including_stop_words.rewrite(reader);102 103 Query query = parseQuery(reader, query_parser, query_string, fuzzy);104 query = query.rewrite(reader);105 106 // Perform the query107 Hits hits;108 if (filter != null) {109 hits = searcher.search(query, filter, sorter);110 }111 else {112 hits = searcher.search(query, sorter);113 }114 100 System.out.println("<ResultSet>"); 115 101 System.out.println(" <QueryString>" + query_string + "</QueryString>"); 116 102 117 // Return the list of expanded query terms and their frequencies 118 HashMap term_counts = new HashMap(); 119 HashMap term_fields = new HashMap(); 120 HashSet terms = new HashSet(); 121 query.extractTerms(terms); 122 Iterator iter = terms.iterator(); 123 while (iter.hasNext()) 124 { 125 Term term = (Term) iter.next(); 126 // If you wanted to limit this to just TX terms add 127 
// something like this: 128 //if (term.field().equals("TX")) 129 term_counts.put(term.text(), new Integer(0)); 130 term_fields.put(term.text(), term.field()); 131 } 132 133 // Do we need to use a hit iterator to get sorted results? 134 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>"); 135 Iterator hit_iter = hits.iterator(); 136 while (hit_iter.hasNext()) 137 { 138 Hit hit = (Hit) hit_iter.next(); 139 Document doc = hit.getDocument(); 140 String node_id = doc.get("nodeID"); 141 System.out.println(" <Match id=\"" + node_id + "\" />"); 103 try { 104 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string); 105 query_including_stop_words = query_including_stop_words.rewrite(reader); 106 107 Query query = parseQuery(reader, query_parser, query_string, fuzzy); 108 query = query.rewrite(reader); 109 110 // Perform the query 111 Hits hits; 112 if (filter != null) { 113 hits = searcher.search(query, filter, sorter); 114 } 115 else { 116 hits = searcher.search(query, sorter); 117 } 118 119 // Return the list of expanded query terms and their frequencies 120 HashMap term_counts = new HashMap(); 121 HashMap term_fields = new HashMap(); 122 HashSet terms = new HashSet(); 123 query.extractTerms(terms); 124 Iterator iter = terms.iterator(); 125 while (iter.hasNext()) 126 { 127 Term term = (Term) iter.next(); 128 // If you wanted to limit this to just TX terms add 129 // something like this: 130 //if (term.field().equals("TX")) 131 term_counts.put(term.text(), new Integer(0)); 132 term_fields.put(term.text(), term.field()); 133 } 134 135 // Do we need to use a hit iterator to get sorted results? 
136 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>"); 137 Iterator hit_iter = hits.iterator(); 138 while (hit_iter.hasNext()) 139 { 140 Hit hit = (Hit) hit_iter.next(); 141 Document doc = hit.getDocument(); 142 String node_id = doc.get("nodeID"); 143 System.out.println(" <Match id=\"" + node_id + "\" />"); 142 144 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 145 // From the document, extract the Term Vector for the 146 // TX field 147 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX"); 148 if (term_freq_vector != null && term_freq_vector.size() > 0) 149 { 150 int[] term_frequencies = term_freq_vector.getTermFrequencies(); 151 // Now for each query term, determine the 152 // frequency - which may of course be 0. 153 Set term_counts_set = term_counts.keySet(); 154 Iterator terms_iter = term_counts_set.iterator(); 155 while (terms_iter.hasNext()) 156 { 157 String term = (String) terms_iter.next(); 158 Integer count_integer = (Integer) term_counts.get(term); 159 int count = count_integer.intValue(); 160 int index = term_freq_vector.indexOf(term); 161 // If the term has a count, then add to 162 // the total count for this term 163 if (index != -1) 164 { 165 count += term_frequencies[index]; 164 166 165 } 166 // Store the result 167 term_counts.put(term, new Integer(count)); 168 count_integer = null; 169 term = null; 170 } 171 terms_iter = null; 172 term_counts_set = null; 173 } 174 else 175 { 176 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 177 } 178 } 179 180 // Retrieve all the useful terms 181 Set term_counts_set = term_counts.keySet(); 182 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>"); 183 // Iterate over them 184 Iterator terms_iter = term_counts_set.iterator(); 185 while (terms_iter.hasNext()) 186 { 187 String term = (String) terms_iter.next(); 188 Integer count = (Integer) term_counts.get(term); 189 String field = (String) term_fields.get(term); 190 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />"); 191 count = null; 192 term = null; 193 } 194 // Cleanup 195 terms_iter = null; 196 term_counts_set = null; 197 198 // Return the list of stop words removed from the query 199 HashSet terms_including_stop_words = new HashSet(); 200 query_including_stop_words.extractTerms(terms_including_stop_words); 201 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator(); 202 while (terms_including_stop_words_iter.hasNext()) { 203 Term term = (Term) terms_including_stop_words_iter.next(); 204 if (!terms.contains(term)) { 205 System.out.println(" <StopWord value=\"" + term.text() + "\"/>"); 167 } 168 // Store the result 169 term_counts.put(term, new Integer(count)); 170 count_integer = null; 171 term = null; 172 } 173 terms_iter = null; 174 term_counts_set = null; 175 } 176 else 177 { 178 ///ystem.err.println("Error! 
Missing term vector for document " + hit.getId()); 179 } 180 } 181 182 // Retrieve all the useful terms 183 Set term_counts_set = term_counts.keySet(); 184 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>"); 185 // Iterate over them 186 Iterator terms_iter = term_counts_set.iterator(); 187 while (terms_iter.hasNext()) 188 { 189 String term = (String) terms_iter.next(); 190 Integer count = (Integer) term_counts.get(term); 191 String field = (String) term_fields.get(term); 192 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />"); 193 count = null; 194 term = null; 195 } 196 // Cleanup 197 terms_iter = null; 198 term_counts_set = null; 199 200 // Return the list of stop words removed from the query 201 HashSet terms_including_stop_words = new HashSet(); 202 query_including_stop_words.extractTerms(terms_including_stop_words); 203 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator(); 204 while (terms_including_stop_words_iter.hasNext()) { 205 Term term = (Term) terms_including_stop_words_iter.next(); 206 if (!terms.contains(term)) { 207 System.out.println(" <StopWord value=\"" + term.text() + "\"/>"); 208 } 206 209 } 207 210 } 208 209 System.out.println("</ResultSet>"); 211 catch (ParseException parse_exception) { 212 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>"); 213 } 214 catch (TooManyClauses too_many_clauses_exception) { 215 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>"); 216 } 217 218 System.out.println("</ResultSet>"); 210 219 } 211 220 212 221 searcher.close(); 213 222 } 214 catch ( Exception exception) {223 catch (IOException exception) { 215 224 exception.printStackTrace(); 216 225 } 217 226 } 218 227
Note: See TracChangeset for help on using the changeset viewer.