Changeset 36864 for gs3-extensions


Ignore:
Timestamp:
2022-10-24T08:33:26+13:00 (18 months ago)
Author:
davidb
Message:

Attempt at a result set that merges later matches from the same doc-id

Location:
gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBDocInfo.java

    r35177 r36864  
    6161    }
    6262
     63    public void incTopRank(double inc_rank_val)
     64    {
     65    double top_rank = rankVector_.get(0);
     66
     67    double new_top_rank = top_rank + inc_rank_val;
     68    rankVector_.set(0,new_top_rank);
     69    }
     70
    6371
    6472    public String getOffsetList()
  • gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBWrapper.java

    r36863 r36864  
    4444   
    4545    /** the query result, filled in by runQuery */
    46     protected Vector query_result_;
     46    protected Vector query_results_;
    4747
    4848    protected int offset_ = 100;
     
    6060
    6161    public WekaDBWrapper() { 
    62     query_result_ = null;
     62    query_results_ = null;
    6363    }
    6464
     
    104104    if (first_entry) {
    105105        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
    106         query_result_.add(wekaDB_doc_info);
     106        query_results_.add(wekaDB_doc_info);
    107107        first_entry = false;
    108108    }
     
    112112        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
    113113       
    114         query_result_.add(wekaDB_doc_info);
     114        query_results_.add(wekaDB_doc_info);
    115115    }
    116116
     
    119119
    120120
     121    protected int mergeResultDoc(Vector query_results, WekaDBDocInfo new_doc_info, double inc_rank_val)
     122    {
     123    int merged = 0;
     124
     125    String new_doc_id = new_doc_info.getDocID();
     126
     127    final int query_results_len = query_results.size();
     128
     129    for (int i=0; i<query_results_len; i++) {
     130        WekaDBDocInfo existing_doc_info = (WekaDBDocInfo)query_results.get(i);
     131
     132        String existing_doc_id = existing_doc_info.getDocID();
     133        if (new_doc_id.equals(existing_doc_id)) {
     134        merged = 1;
     135        existing_doc_info.incTopRank(inc_rank_val);
     136        break;
     137        }
     138    }
     139
     140    if (merged == 0) {
     141        query_results.add(new_doc_info);
     142    }
     143   
     144    return merged;
     145    }
     146   
    121147    /** actually carry out the query.
    122148    Use the set methods to set query results.
    123     Writes the result to query_result.
     149    Writes the result to query_results.
    124150     * - maintains state between requests as can be slow 
    125151     * base_dir and index_path should join together to provide
     
    172198
    173199       
    174     Vector expanded_query_result = new Vector();
     200    Vector expanded_query_results = new Vector();
    175201
    176202    int nearest_instances_len = nearest_instances.size();
     
    179205
    180206    double pos_penalty = 0.1;
     207    int    topup_count = 0;
    181208   
    182209    for (int ei=0; ei<clamped_expanded_k_nearest_num; ei++) {
     
    186213        String matching_doc_id_segment = instance.stringValue(0);
    187214
    188        
    189215        //Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
    190216        Matcher m = doc_seg_re.matcher(matching_doc_id_segment);
     
    193219        String matching_doc_id = m.group(1);
    194220        int end_of_matching_segment_offset = Integer.parseInt(m.group(2));
    195         int matching_segment_offset = end_of_matching_segment_offset - (int)AV_SEGMENT_LENGTH_SECS;
     221        //int matching_segment_offset = end_of_matching_segment_offset - (int)AV_SEGMENT_LENGTH_SECS;
     222        int matching_segment_offset = end_of_matching_segment_offset;
    196223       
    197224        if (matching_doc_id.equals(doc_id)) {
     225            // don't add in matches that come from a matching segment in the query doc
    198226            continue;
    199227        }
     
    208236        logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
    209237        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
    210         expanded_query_result.add(wekaDB_doc_info);
     238       
     239        //expanded_query_results.add(wekaDB_doc_info);
     240
     241        double inc_rank_val = matching_rank / (double)(topup_count+2); // starts at a 50% (/2) weighting when topup_count == 0
     242        int merged = mergeResultDoc(expanded_query_results,wekaDB_doc_info,inc_rank_val);
     243
     244        topup_count += merged;
     245
     246        if ((expanded_query_results.size() > k_nearest_num) && (topup_count > k_nearest_num)) {
     247            // guard to stop multiple recurring matches in the same doc dominating the rank_val
     248            break;
     249        }
    211250        }
    212251        else {
     
    214253        }         
    215254    }
    216    
    217     query_result_ = new Vector();
     255
     256    Collections.sort(expanded_query_results);
     257       
     258    query_results_ = new Vector();
    218259
    219260    int i = 0;
    220261    while (i < k_nearest_num) {
    221         if (i >= expanded_query_result.size()) {
     262        if (i >= expanded_query_results.size()) {
    222263        break;
    223264        }
    224265       
    225         query_result_.add(expanded_query_result.get(i));
     266        query_results_.add(expanded_query_results.get(i));
    226267        i++;
    227268    }
    228269
    229     Collections.sort(query_result_);   
     270    //Collections.sort(query_results_);
    230271    }
    231272   
     
    265306        BufferedReader wbr = new BufferedReader(wisr);
    266307
    267         query_result_ = new Vector();
     308        query_results_ = new Vector();
    268309
    269310        boolean first_entry = true;
     
    335376        wbr.close();
    336377
    337         // sort query_result_ on 'rank' field
     378        // sort query_results_ on 'rank' field
    338379        // note: compareTo() method implemented to sort into descending order
    339380
    340         Collections.sort(query_result_);
     381        Collections.sort(query_results_);
    341382
    342383
     
    353394    public Vector getQueryResult()
    354395    {
    355     return query_result_;
     396    return query_results_;
    356397    }
    357398}
Note: See TracChangeset for help on using the changeset viewer.