Changeset 36964


Ignore:
Timestamp:
2022-11-30T17:22:16+13:00 (17 months ago)
Author:
davidb
Message:

Experimenting with a cruder version of the algorithm that returns the k-nearest matching AV docs; also expanded the returned XML message to include the AV values for each match

Location:
gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/service/GS2WekaDBSearch.java

    r36859 r36964  
    150150
    151151            String doc_id  = wdb_doc.getDocID();
     152
     153            double arousal = wdb_doc.getTopArousal();
     154            double valence = wdb_doc.getTopValence();           
     155
    152156            double rank    = wdb_doc.getTopRank();
    153157            String offsets = wdb_doc.getOffsetList();
    154158
    155             Element doc_node = createDocNode (result_doc, doc_id, Double.toString (rank));
     159            Element doc_node = createDocNode(result_doc, doc_id, Double.toString(rank));
     160            doc_node.setAttribute("arousalVal", Double.toString(arousal));
     161            doc_node.setAttribute("valenceVal", Double.toString(valence));
    156162            doc_node.setAttribute("frameOffset", offsets);
    157163
  • gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBDocInfo.java

    r36864 r36964  
    2929{
    3030    public String oid_;
     31    public Vector<Double> arousalVector_;
     32    public Vector<Double> valenceVector_;
    3133    public Vector<Double> rankVector_;
    3234    public Vector<Integer> offsetVector_;
    3335
    34     public WekaDBDocInfo(String doc_oid, double rank, int offset)
     36    public WekaDBDocInfo(String doc_oid, double arousal, double valence, double rank, int offset)
    3537    {
    3638    oid_ = doc_oid;
    37    
     39
     40    arousalVector_ = new Vector<Double>();
     41    arousalVector_.add (arousal);
     42    valenceVector_ = new Vector<Double>();
     43    valenceVector_.add (valence);
     44
    3845    rankVector_ = new Vector<Double>();
    3946    rankVector_.add (rank);
     
    4451
    4552
    46     public WekaDBDocInfo(String doc_oid, Vector<Double> rankVector, Vector<Integer> offsetVector)
     53    public WekaDBDocInfo(String doc_oid, Vector<Double> arousalVector, Vector<Double> valenceVector,
     54             Vector<Double> rankVector, Vector<Integer> offsetVector)
    4755    {
    4856    oid_ = doc_oid;
    49     rankVector_ = rankVector;
     57
     58    arousalVector_ = arousalVector;
     59    valenceVector_ = valenceVector;
     60   
     61    rankVector_   = rankVector;
    5062    offsetVector_ = offsetVector;
    5163    }
     
    5466    {
    5567    return oid_;
     68    }
     69
     70    public double getTopArousal()
     71    {
     72    return arousalVector_.get(0);
     73    }
     74
     75    public double getTopValence()
     76    {
     77    return valenceVector_.get(0);
    5678    }
    5779
  • gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBWrapper.java

    r36864 r36964  
    9898
    9999
    100     protected boolean addQueryResult(boolean first_entry, String doc_id,
     100    protected boolean addQueryResult(boolean first_entry, String doc_id,
     101                     Vector<Double> arousalVector, Vector<Double> valenceVector,
    101102                     Vector<Double> rankVector, Vector<Integer> offsetVector)
    102103    {
    103104
    104105    if (first_entry) {
    105         WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
     106        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,arousalVector,valenceVector,rankVector,offsetVector);
    106107        query_results_.add(wekaDB_doc_info);
    107108        first_entry = false;
    108109    }
    109110    else {
    110         double rank = rankVector.get(0);
    111         int offset = offsetVector.get(0);
    112         WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
     111        double arousal = arousalVector.get(0);
     112        double valence = valenceVector.get(0);
     113
     114        double rank    = rankVector.get(0);
     115        int offset     = offsetVector.get(0);
     116       
     117        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,arousal,valence,rank,offset);
    113118       
    114119        query_results_.add(wekaDB_doc_info);
     
    150155     * - maintains state between requests as can be slow 
    151156     * base_dir and index_path should join together to provide
    152      * the absolute location of the mg index files eg ..../index/dtx/demo
     157     * the absolute location of the Weka CSV file e.g.  <col>/index/wekaDB/av-features.csv
    153158     * base_dir must end with a file separator (OS dependant)
    154159     */
     
    156161
    157162    public void runQuery(String wekaDB_index_dir, String knn_model_file,
    158              String assoc_index_dir, String query_string) {
    159 
    160     // combine index_dir with audiodb filename
     163             String assoc_index_dir, String query_string)
     164    {
    161165
    162166    String full_knn_model_filename  = wekaDB_index_dir + File.separatorChar + knn_model_file;
    163167
    164     //String full_chr12_filename = assoc_index_dir + File.separatorChar
    165     //    + query_string + File.separatorChar + "doc.chr12";
    166 
    167     System.err.println("**** full knn model filename  = " + full_knn_model_filename);
     168    //System.err.println("**** full knn model filename  = " + full_knn_model_filename);
    168169
    169170    // Example returned result from Weka KNN
     
    186187
    187188    double query_arousal_val = arousal_;
    188     double query_valence_val =  valence_;
     189    double query_valence_val = valence_;
    189190
    190191    int k_nearest_num = max_docs_;
     
    202203    int nearest_instances_len = nearest_instances.size();
    203204   
    204     int clamped_expanded_k_nearest_num = Math.min(expanded_k_nearest_num,nearest_instances_len);
    205 
    206     double pos_penalty = 0.1;
    207     int    topup_count = 0;
     205    int clamped_expanded_k_nearest_num = Math.max(nearest_instances_len,k_nearest_num);
     206
     207    if (clamped_expanded_k_nearest_num > k_nearest_num) {
     208        System.err.println("**** expanded number of k-nearest matches = " + clamped_expanded_k_nearest_num);
     209    }
    208210   
    209211    for (int ei=0; ei<clamped_expanded_k_nearest_num; ei++) {
    210212        Instance instance = nearest_instances.instance(ei);
    211         logger.info("\tProcessing returned instance: " + instance);
    212213       
    213214        String matching_doc_id_segment = instance.stringValue(0);
     
    224225        if (matching_doc_id.equals(doc_id)) {
    225226            // don't add in matches that come from a matching segment in the query doc
     227            //logger.info("\tSelf-match with query doc => Skipping: " + instance);
     228            System.err.println("\tSelf-match with query doc => Skipping: " + instance);
     229
     230            continue;
     231        }
     232
     233        //logger.info("\tAdding returned instance: " + instance);
     234        System.err.println("\tAdding returned instance: " + instance);
     235
     236        double matching_arousal_val = instance.value(1);
     237        double matching_valence_val = instance.value(2);
     238       
     239        double matching_diff = (Math.abs(query_arousal_val - matching_arousal_val)
     240                    + Math.abs(query_valence_val - matching_valence_val))/4.0;
     241        double matching_rank = 1.0 - matching_diff;
     242       
     243        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,
     244                                  matching_arousal_val, matching_valence_val,
     245                                  matching_rank,matching_segment_offset);
     246       
     247        expanded_query_results.add(wekaDB_doc_info);
     248        }
     249        else {
     250        logger.error("Returned AV k-nearest neighbour match '"+matching_doc_id_segment+"' could not be parsed as <doc-id>-<segment>" );
     251        }         
     252    }
     253
     254    //Collections.sort(expanded_query_results);
     255       
     256    query_results_ = new Vector();
     257
     258    int i = 0;
     259    while (i < k_nearest_num) {
     260        if (i >= expanded_query_results.size()) {
     261        break;
     262        }
     263       
     264        query_results_.add(expanded_query_results.get(i));
     265        i++;
     266    }
     267
     268    //Collections.sort(query_results_);
     269    }
     270
     271
     272
     273    public void runQueryDiffAndMerge(String wekaDB_index_dir, String knn_model_file,
     274             String assoc_index_dir, String query_string)
     275    {
     276
     277    // combine index_dir with audiodb filename
     278
     279    String full_knn_model_filename  = wekaDB_index_dir + File.separatorChar + knn_model_file;
     280
     281    //String full_chr12_filename = assoc_index_dir + File.separatorChar
     282    //    + query_string + File.separatorChar + "doc.chr12";
     283
     284    System.err.println("**** full knn model filename  = " + full_knn_model_filename);
     285
     286    // Example returned result from Weka KNN
     287    // => first line is the input instance ('filename+segment',Arousal,Valence)
     288    //    following (indented lines) nearest neighbour matches in same format
     289    //
     290    // ds_22716_5743-6,-0.549489,-0.118439
     291    //  ds_22716_5743-6,-0.549489,-0.118439
     292    //  ds_31008_6550-30,-0.549489,-0.118439
     293    //  ds_72651_26831-6,-0.549489,-0.118439
     294    //  ds_26196_9214-18,-0.549489,-0.118439
     295
     296
     297    WekaFindInstanceKNN.init(full_knn_model_filename);
     298
     299    String doc_id  = query_string;
     300    int    segment = offset_;
     301
     302    String query_doc_id_segment =  doc_id + "-" + segment;
     303
     304    double query_arousal_val = arousal_;
     305    double query_valence_val =  valence_;
     306
     307    int k_nearest_num = max_docs_;
     308    int expanded_k_nearest_num = max_docs_ * 5; // * internally get more matches, then sift through to arrive at the best 'max_docs_'
     309   
     310    Pattern doc_seg_re = Pattern.compile("^(\\w+)-(\\d+)$");
     311    //Matcher query_doc_seg_match = doc_seq_re.matcher(query_doc_id_segment);
     312   
     313    Instances nearest_instances
     314        = WekaFindInstanceKNN.kNearestNeighbours(query_doc_id_segment,query_arousal_val,query_valence_val,k_nearest_num);
     315
     316       
     317    Vector expanded_query_results = new Vector();
     318
     319    int nearest_instances_len = nearest_instances.size();
     320   
     321    int clamped_expanded_k_nearest_num = Math.min(expanded_k_nearest_num,nearest_instances_len);
     322
     323    double pos_penalty = 0.1;
     324    int    topup_count = 0;
     325   
     326    for (int ei=0; ei<clamped_expanded_k_nearest_num; ei++) {
     327        Instance instance = nearest_instances.instance(ei);
     328        logger.info("\tProcessing returned instance: " + instance);
     329       
     330        String matching_doc_id_segment = instance.stringValue(0);
     331
     332        //Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
     333        Matcher m = doc_seg_re.matcher(matching_doc_id_segment);
     334        if (m.matches()) {
     335
     336        String matching_doc_id = m.group(1);
     337        int end_of_matching_segment_offset = Integer.parseInt(m.group(2));
     338        //int matching_segment_offset = end_of_matching_segment_offset - (int)AV_SEGMENT_LENGTH_SECS;
     339        int matching_segment_offset = end_of_matching_segment_offset;
     340       
     341        if (matching_doc_id.equals(doc_id)) {
     342            // don't add in matches that come from a matching segment in the query doc
    226343            continue;
    227344        }
     
    235352
    236353        logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
    237         WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
     354        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,
     355                                  matching_arousal_val, matching_valence_val,
     356                                  matching_rank,matching_segment_offset);
    238357       
    239358        //expanded_query_results.add(wekaDB_doc_info);
     
    270389    //Collections.sort(query_results_);
    271390    }
    272    
    273     public void runQueryOLD(String wekaDB_index_dir, String knn_model_file,
    274              String assoc_index_dir, String query_string) {
    275 
    276     // combine index_dir with audiodb filename
    277 
    278     String full_knn_model_filename  = wekaDB_index_dir + File.separatorChar + knn_model_file;
    279     String full_chr12_filename = assoc_index_dir + File.separatorChar
    280         + query_string + File.separatorChar + "doc.chr12";
    281 
    282     int num_matches_within_track = 6;
    283 
    284     // ****
    285     String [] cmd_array = new String[] {
    286         "java", "-jar", "weka.jar",
    287         "-d", full_knn_model_filename,
    288         "-Q", "nsequence",
    289         "-p", String.format("%d",offset_),
    290         "-n", String.format("%d",num_matches_within_track),
    291         "-l", String.format("%d",length_),
    292         "-r", String.format("%d",max_docs_),
    293         "-f", full_chr12_filename
    294     };
    295 
    296     System.err.println("**** cmd_array = " + String.join(" ", cmd_array));
    297 
    298     Runtime runtime = Runtime.getRuntime();
    299     try {
    300         Process wekaDB_proc = runtime.exec(cmd_array);
    301         //int exitVal = wekaDB_proc.waitFor();
    302         //System.err.println("*** exit status = " + exitVal);
    303 
    304         InputStream wis = wekaDB_proc.getInputStream();
    305         InputStreamReader wisr = new InputStreamReader(wis);
    306         BufferedReader wbr = new BufferedReader(wisr);
    307 
    308         query_results_ = new Vector();
    309 
    310         boolean first_entry = true;
    311         int line_count = 0;
    312 
    313         String root_doc_id = null;
    314         Vector<Double> rankVector = new Vector<Double>();
    315         Vector<Integer> offsetVector = new Vector<Integer>();
    316 
    317         // Example output
    318         //   D8 0.00105175
    319         //   1.69786e-16 392 392
    320         //   0.00113568 392 673
    321         //   0.00127239 392 910
    322         //   0.00139736 392 481
    323         //   0.00145331 392 303
    324         //   D2 0.00429758
    325         //   0.00403335 392 865
    326         //   0.00411288 392 458
    327         //   0.00442461 392 866
    328         //   0.00444272 392 864
    329         //   0.00447434 392 424
    330         // ...
    331 
    332         String line;
    333         while ((line = wbr.readLine()) != null) {
    334         String[] tokens = line.split("\\s+");
    335         line_count++;
    336 
    337         if (tokens.length==2) {
    338             // processing a top-level doc line
    339 
    340             if (line_count>1) {
    341             // struck new top-level entry => store vector vals for previous block
    342 
    343             first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
    344             // and now reset vectors to empty to be ready for next chain of values
    345             rankVector = new Vector<Double>();
    346             offsetVector = new Vector<Integer>();
    347             }
    348 
    349             root_doc_id = tokens[0];
    350         }
    351         else {
    352             // should be 3 items
    353             double euclidean_dist = Double.parseDouble(tokens[0]);
    354             int src_frame = Integer.parseInt(tokens[1]);
    355             int target_frame = Integer.parseInt(tokens[2]);
    356 
    357             // ****
    358            
    359             // enforce 1.0 as upper limit due to rounding errors
    360             // in audioDB distance calculations
    361             double rank = Math.min(1.0 - euclidean_dist,1.0);
    362 
    363             if ((line_count==2) && (src_frame==target_frame)) {
    364             // Found match with self
    365             continue;
    366             }
    367 
    368             rankVector.add(rank);
    369             offsetVector.add(target_frame);
    370         }
    371 
    372         }
    373 
    374         addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
    375 
    376         wbr.close();
    377 
    378         // sort query_results_ on 'rank' field
    379         // note: compareTo() method implemented to sort into descending order
    380 
    381         Collections.sort(query_results_);
    382 
    383 
    384     }
    385     catch (Exception e) {
    386         logger.error("Failed to execute the following command: " +  String.join(" ", cmd_array));
    387         e.printStackTrace();
    388     }
    389 
    390     }
    391391
    392392
Note: See TracChangeset for help on using the changeset viewer.