Changeset 12775


Ignore:
Timestamp:
2006-09-19T12:55:21+12:00 (18 years ago)
Author:
mdewsnip
Message:

Fixed bug where some terms were output with zero frequency (because they don't actually appear in the matching documents); such terms are now skipped.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java

    r12773 r12775  
    215215            Integer count = (Integer) term_counts.get(term);
    216216            String field = (String) term_fields.get(term);
    217             System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
     217
     218            // Ignore any terms with zero frequency, because they don't exist in the matching
     219            // documents. It seems that this should never happen, but it's a consequence of
     220            // how the terms are identified. The terms are found by rewriting the query (above).
     221            // At this point, the query hasn't been run, so each query term is expanded without
     222            // knowing whether the expanded term will actually appear in one of the resulting
     223            // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
     224            // the search is for "otago AND auckland", no matching documents may include "otaio".
     225            // Hopefully that made some sense...
     226            if (count.intValue() > 0) {
     227                System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
     228            }
    218229            count = null;
    219230            term = null;
    220231            }
     232
    221233            // Cleanup
    222234            terms_iter = null;
  • trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r12773 r12775  
    215215            Integer count = (Integer) term_counts.get(term);
    216216            String field = (String) term_fields.get(term);
    217             System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
     217
     218            // Ignore any terms with zero frequency, because they don't exist in the matching
     219            // documents. It seems that this should never happen, but it's a consequence of
     220            // how the terms are identified. The terms are found by rewriting the query (above).
     221            // At this point, the query hasn't been run, so each query term is expanded without
     222            // knowing whether the expanded term will actually appear in one of the resulting
     223            // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
     224            // the search is for "otago AND auckland", no matching documents may include "otaio".
     225            // Hopefully that made some sense...
     226            if (count.intValue() > 0) {
     227                System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
     228            }
    218229            count = null;
    219230            term = null;
    220231            }
     232
    221233            // Cleanup
    222234            terms_iter = null;
Note: See TracChangeset for help on using the changeset viewer.