Context Navigation

← Previous Changeset
Next Changeset →

Changeset 36964

Timestamp:

2022-11-30T17:22:16+13:00 (17 months ago)

Author:

davidb

Message:

Experimenting with cruder version of algorithm to return k-nearest matching AV docs; also expanded the XML message returned to include AV values for each match

Location:

gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3

Files:

3 edited

service/GS2WekaDBSearch.java (modified) (1 diff)
util/WekaDBDocInfo.java (modified) (3 diffs)
util/WekaDBWrapper.java (modified) (8 diffs)

Legend:

Unmodified
Added
Removed

gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/service/GS2WekaDBSearch.java

-              r36859
+              r36964
             String doc_id  = wdb_doc.getDocID();
+            double arousal = wdb_doc.getTopArousal();
+            double valence = wdb_doc.getTopValence();
             double rank    = wdb_doc.getTopRank();
             String offsets = wdb_doc.getOffsetList();
+            Element doc_node = createDocNode (result_doc, doc_id, Double.toString (rank));
+            Element doc_node = createDocNode(result_doc, doc_id, Double.toString(rank));
+            doc_node.setAttribute("arousalVal", Double.toString(arousal));
+            doc_node.setAttribute("valenceVal", Double.toString(valence));
             doc_node.setAttribute("frameOffset", offsets);

gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBDocInfo.java

-              r36864
+              r36964
+{
     public String oid_;
+    public Vector<Double> arousalVector_;
+    public Vector<Double> valenceVector_;
     public Vector<Double> rankVector_;
     public Vector<Integer> offsetVector_;
     public WekaDBDocInfo(String doc_oid, double rank, int offset)
+    public WekaDBDocInfo(String doc_oid, double arousal, double valence, double rank, int offset)
+    {
     oid_ = doc_oid;
+    arousalVector_ = new Vector<Double>();
+    arousalVector_.add (arousal);
+    valenceVector_ = new Vector<Double>();
+    valenceVector_.add (valence);
     rankVector_ = new Vector<Double>();
     rankVector_.add (rank);
 …
+    public WekaDBDocInfo(String doc_oid, Vector<Double> rankVector, Vector<Integer> offsetVector)
+    public WekaDBDocInfo(String doc_oid, Vector<Double> arousalVector, Vector<Double> valenceVector,
+             Vector<Double> rankVector, Vector<Integer> offsetVector)
+    {
     oid_ = doc_oid;
+    rankVector_ = rankVector;
+    arousalVector_ = arousalVector;
+    valenceVector_ = valenceVector;
+    rankVector_   = rankVector;
     offsetVector_ = offsetVector;
+    }
 …
+    {
     return oid_;
+    }
+    public double getTopArousal()
+    {
+    return arousalVector_.get(0);
+    }
+    public double getTopValence()
+    {
+    return valenceVector_.get(0);
+    }

gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBWrapper.java

-              r36864
+              r36964
+    protected boolean addQueryResult(boolean first_entry, String doc_id,
+    protected boolean addQueryResult(boolean first_entry, String doc_id,
+                     Vector<Double> arousalVector, Vector<Double> valenceVector,
                      Vector<Double> rankVector, Vector<Integer> offsetVector)
+    {
     if (first_entry) {
         WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
+        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,arousalVector,valenceVector,rankVector,offsetVector);
         query_results_.add(wekaDB_doc_info);
         first_entry = false;
+    }
     else {
+        double rank = rankVector.get(0);
+        int offset = offsetVector.get(0);
+        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
+        double arousal = arousalVector.get(0);
+        double valence = valenceVector.get(0);
+        double rank    = rankVector.get(0);
+        int offset     = offsetVector.get(0);
+        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,arousal,valence,rank,offset);
         query_results_.add(wekaDB_doc_info);
 …
      * - maintains state between requests as can be slow
      * base_dir and index_path should join together to provide
      * the absolute location of the mg index files eg ..../index/dtx/demo
+     * the absolute location of the Weka CSV file e.g.  <col>/index/wekaDB/av-features.csv
      * base_dir must end with a file separator (OS dependant)
      */
 …
     public void runQuery(String wekaDB_index_dir, String knn_model_file,
+             String assoc_index_dir, String query_string) {
+    // combine index_dir with audiodb fileanem
+             String assoc_index_dir, String query_string)
+    {
     String full_knn_model_filename  = wekaDB_index_dir + File.separatorChar + knn_model_file;
+    //String full_chr12_filename = assoc_index_dir + File.separatorChar
+    //    + query_string + File.separatorChar + "doc.chr12";
+    System.err.println("**** full knn model filename  = " + full_knn_model_filename);
+    //System.err.println("**** full knn model filename  = " + full_knn_model_filename);
     // Example returned result from Weka KNN
 …
     double query_arousal_val = arousal_;
     double query_valence_val =  valence_;
+    double query_valence_val = valence_;
     int k_nearest_num = max_docs_;
 …
     int nearest_instances_len = nearest_instances.size();
+    int clamped_expanded_k_nearest_num = Math.min(expanded_k_nearest_num,nearest_instances_len);
+    double pos_penalty = 0.1;
+    int    topup_count = 0;
+    int clamped_expanded_k_nearest_num = Math.max(nearest_instances_len,k_nearest_num);
+    if (clamped_expanded_k_nearest_num > k_nearest_num) {
+        System.err.println("**** expanded number of k-nearest matches = " + clamped_expanded_k_nearest_num);
+    }
     for (int ei=0; ei<clamped_expanded_k_nearest_num; ei++) {
         Instance instance = nearest_instances.instance(ei);
-        logger.info("\tProcessing returned instance: " + instance);
         String matching_doc_id_segment = instance.stringValue(0);
 …
         if (matching_doc_id.equals(doc_id)) {
             // don't add in matches that come from a matching segment in the query doc
+            //logger.info("\tSelf-match with query doc => Skipping: " + instance);
+            System.err.println("\tSelf-match with query doc => Skipping: " + instance);
+            continue;
+        }
+        //logger.info("\tAdding returned instance: " + instance);
+        System.err.println("\tAdding returned instance: " + instance);
+        double matching_arousal_val = instance.value(1);
+        double matching_valence_val = instance.value(2);
+        double matching_diff = (Math.abs(query_arousal_val - matching_arousal_val)
+                    + Math.abs(query_valence_val - matching_valence_val))/4.0;
+        double matching_rank = 1.0 - matching_diff;
+        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,
+                                  matching_arousal_val, matching_valence_val,
+                                  matching_rank,matching_segment_offset);
+        expanded_query_results.add(wekaDB_doc_info);
+        }
+        else {
+        logger.error("Returned AV k-nearest neighbour match '"+matching_doc_id_segment+"' could not be parsed as <doc-id>-<segment>" );
+        }
+    }
+    //Collections.sort(expanded_query_results);
+    query_results_ = new Vector();
+    int i = 0;
+    while (i < k_nearest_num) {
+        if (i >= expanded_query_results.size()) {
+        break;
+        }
+        query_results_.add(expanded_query_results.get(i));
+        i++;
+    }
+    //Collections.sort(query_results_);
+    }
+    public void runQueryDiffAndMerge(String wekaDB_index_dir, String knn_model_file,
+             String assoc_index_dir, String query_string)
+    {
+    // combine index_dir with audiodb fileanem
+    String full_knn_model_filename  = wekaDB_index_dir + File.separatorChar + knn_model_file;
+    //String full_chr12_filename = assoc_index_dir + File.separatorChar
+    //    + query_string + File.separatorChar + "doc.chr12";
+    System.err.println("**** full knn model filename  = " + full_knn_model_filename);
+    // Example returned result from Weka KNN
+    // => first line is the input instance ('filename+segment',Arousal,Valence)
+    //    following (indented lines) nearest neighbour matches in same format
+    //
+    // ds_22716_5743-6,-0.549489,-0.118439
+    //  ds_22716_5743-6,-0.549489,-0.118439
+    //  ds_31008_6550-30,-0.549489,-0.118439
+    //  ds_72651_26831-6,-0.549489,-0.118439
+    //  ds_26196_9214-18,-0.549489,-0.118439
+    WekaFindInstanceKNN.init(full_knn_model_filename);
+    String doc_id  = query_string;
+    int    segment = offset_;
+    String query_doc_id_segment =  doc_id + "-" + segment;
+    double query_arousal_val = arousal_;
+    double query_valence_val =  valence_;
+    int k_nearest_num = max_docs_;
+    int expanded_k_nearest_num = max_docs_ * 5; // * internally get more matches, then sift through to arrive at the best 'max_docs_'
+    Pattern doc_seg_re = Pattern.compile("^(\\w+)-(\\d+)$");
+    //Matcher query_doc_seg_match = doc_seq_re.matcher(query_doc_id_segment);
+    Instances nearest_instances
+        = WekaFindInstanceKNN.kNearestNeighbours(query_doc_id_segment,query_arousal_val,query_valence_val,k_nearest_num);
+    Vector expanded_query_results = new Vector();
+    int nearest_instances_len = nearest_instances.size();
+    int clamped_expanded_k_nearest_num = Math.min(expanded_k_nearest_num,nearest_instances_len);
+    double pos_penalty = 0.1;
+    int    topup_count = 0;
+    for (int ei=0; ei<clamped_expanded_k_nearest_num; ei++) {
+        Instance instance = nearest_instances.instance(ei);
+        logger.info("\tProcessing returned instance: " + instance);
+        String matching_doc_id_segment = instance.stringValue(0);
+        //Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
+        Matcher m = doc_seg_re.matcher(matching_doc_id_segment);
+        if (m.matches()) {
+        String matching_doc_id = m.group(1);
+        int end_of_matching_segment_offset = Integer.parseInt(m.group(2));
+        //int matching_segment_offset = end_of_matching_segment_offset - (int)AV_SEGMENT_LENGTH_SECS;
+        int matching_segment_offset = end_of_matching_segment_offset;
+        if (matching_doc_id.equals(doc_id)) {
+            // don't add in matches that come from a matching segment in the query doc
             continue;
+        }
 …
         logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
+        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
+        WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,
+                                  matching_arousal_val, matching_valence_val,
+                                  matching_rank,matching_segment_offset);
         //expanded_query_results.add(wekaDB_doc_info);
 …
     //Collections.sort(query_results_);
+    }
-    public void runQueryOLD(String wekaDB_index_dir, String knn_model_file,
-             String assoc_index_dir, String query_string) {
-    // combine index_dir with audiodb fileanem
-    String full_knn_model_filename  = wekaDB_index_dir + File.separatorChar + knn_model_file;
-    String full_chr12_filename = assoc_index_dir + File.separatorChar
-        + query_string + File.separatorChar + "doc.chr12";
-    int num_matches_within_track = 6;
-    // ****
-    String [] cmd_array = new String[] {
-        "java", "-jar", "weka.jar",
-        "-d", full_knn_model_filename,
-        "-Q", "nsequence",
-        "-p", String.format("%d",offset_),
-        "-n", String.format("%d",num_matches_within_track),
-        "-l", String.format("%d",length_),
-        "-r", String.format("%d",max_docs_),
-        "-f", full_chr12_filename
-    };
-    System.err.println("**** cmd_array = " + String.join(" ", cmd_array));
-    Runtime runtime = Runtime.getRuntime();
-    try {
-        Process wekaDB_proc = runtime.exec(cmd_array);
-        //int exitVal = wekaDB_proc.waitFor();
-        //System.err.println("*** exit status = " + exitVal);
-        InputStream wis = wekaDB_proc.getInputStream();
-        InputStreamReader wisr = new InputStreamReader(wis);
-        BufferedReader wbr = new BufferedReader(wisr);
-        query_results_ = new Vector();
-        boolean first_entry = true;
-        int line_count = 0;
-        String root_doc_id = null;
-        Vector<Double> rankVector = new Vector<Double>();
-        Vector<Integer> offsetVector = new Vector<Integer>();
-        // Example output
-        //   D8 0.00105175
-        //   1.69786e-16 392 392
-        //   0.00113568 392 673
-        //   0.00127239 392 910
-        //   0.00139736 392 481
-        //   0.00145331 392 303
-        //   D2 0.00429758
-        //   0.00403335 392 865
-        //   0.00411288 392 458
-        //   0.00442461 392 866
-        //   0.00444272 392 864
-        //   0.00447434 392 424
-        // ...
-        String line;
-        while ((line = wbr.readLine()) != null) {
-        String[] tokens = line.split("\\s+");
-        line_count++;
-        if (tokens.length==2) {
-            // processing a top-level doc line
-            if (line_count>1) {
-            // struck new top-level entry => store vector vals for previous block
-            first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
-            // and now reset vectors to empty to be ready for next chain of values
-            rankVector = new Vector<Double>();
-            offsetVector = new Vector<Integer>();
+            }
-            root_doc_id = tokens[0];
+        }
-        else {
-            // should be 3 items
-            double euclidean_dist = Double.parseDouble(tokens[0]);
-            int src_frame = Integer.parseInt(tokens[1]);
-            int target_frame = Integer.parseInt(tokens[2]);
-            // ****
-            // enforce 1.0 as upper limit due to rounding errors
-            // in audioDB distance calculations
-            double rank = Math.min(1.0 - euclidean_dist,1.0);
-            if ((line_count==2) && (src_frame==target_frame)) {
-            // Found match with self
-            continue;
+            }
-            rankVector.add(rank);
-            offsetVector.add(target_frame);
+        }
+        }
-        addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
-        wbr.close();
-        // sort query_results_ on 'rank' field
-        // note: compareTo() method impelemented to sort into descending order
-        Collections.sort(query_results_);
+    }
-    catch (Exception e) {
-        logger.error("Failed to execute the following command: " +  String.join(" ", cmd_array));
-        e.printStackTrace();
+    }
+    }

Note: See TracChangeset for help on using the changeset viewer.