Changeset 8731
- Timestamp:
- 2004-12-03T13:09:47+13:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/action/DocumentAction.java
r8714 r8731 31 31 32 32 // General Java classes 33 import java.util.ArrayList; 33 34 import java.util.HashMap; 34 35 import java.util.HashSet; … … 48 49 * type services (enrich) will be offered to the user as well */ 49 50 protected static final boolean provide_annotations = false; //true; 51 50 52 51 53 public Element process (Element message) … … 540 542 541 543 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER); 542 Element query_term_ info_list = (Element) GSXML.getNodeByPath(mr_query_response, path);543 if (query_term_ info_list == null) {544 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path); 545 if (query_term_list_element == null) { 544 546 // no term info 545 547 System.err.println("DocumentAction: Warning: No query term information.\n"); … … 547 549 } 548 550 549 NodeList equivs = query_term_info_list.getElementsByTagName("equivTermList");550 HashSet all_terms = new HashSet();551 for (int i=0; i<equivs.getLength(); i++) {552 553 // get the terms554 String [] terms = GSXML.getAttributeValuesFromList((Element)equivs.item(i), GSXML.NAME_ATT);555 for (int j=0; j<terms.length; j++) {556 557 all_terms.add(terms[j]);558 }559 }560 561 Element new_content_elem = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);562 563 551 String content = GSXML.getNodeText(dc_response_doc_content); 564 552 565 StringBuffer temp = new StringBuffer(); 566 StringBuffer temp_content = new StringBuffer(); 567 568 for (int i=0; i<content.length(); i++) { 569 char c = content.charAt(i); 570 if (Character.isLetterOrDigit(c)) { 571 // not word boundary 572 temp.append(c); 573 } else { 574 // word boundary 575 // add the last word if there was one 576 if (temp.length()>0) { 577 if (all_terms.contains(temp.toString())) { 578 //if there is anything already present in temp_content, add it as a text node 579 Text t = this.doc.createTextNode(temp_content.toString()); 580 new_content_elem.appendChild(t); 581 temp_content.delete(0, temp_content.length()); 582 Element annot = GSXML.createTextElement(this.doc, "annotation", temp.toString()); 583 annot.setAttribute("type", "query_term"); 584 new_content_elem.appendChild(annot); 585 //new_content.append("<annotation type='query_term'>"+temp+"</annotation>"); 586 } else { 587 temp_content.append(temp); 553 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER); 554 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 555 556 HashSet query_term_variants = new HashSet(); 557 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 558 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) { 559 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i); 560 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT); 561 for (int j = 0; j < equivalent_terms.length; j++) { 562 System.err.println("Adding query term variant: " + equivalent_terms[j]); 563 query_term_variants.add(equivalent_terms[j]); 564 } 565 } 566 567 ArrayList phrase_query_term_variants_hierarchy = new ArrayList(); 568 569 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 570 String performed_query = GSXML.getNodeText(query_element) + " "; 571 572 ArrayList phrase_query_p_term_variants_list = new ArrayList(); 573 int term_start = 0; 574 boolean in_term = false; 575 boolean in_phrase = false; 576 for (int i = 0; i < performed_query.length(); i++) { 577 char character = performed_query.charAt(i); 578 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); 579 580 // Has a query term just started? 581 if (in_term == false && is_character_letter_or_digit == true) { 582 in_term = true; 583 term_start = i; 584 } 585 586 // Or has a term just finished? 587 else if (in_term == true && is_character_letter_or_digit == false) { 588 in_term = false; 589 String term = performed_query.substring(term_start, i); 590 System.err.println("Term: " + term); 591 592 HashSet phrase_query_p_term_x_variants = new HashSet(); 593 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 594 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 595 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) { 596 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j); 597 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT); 598 for (int k = 0; k < term_equivalent_terms.length; k++) { 599 System.err.println("Adding query term variant: " + term_equivalent_terms[k]); 600 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]); 588 601 } 589 temp.delete(0, temp.length()); 590 } 591 if (c=='<') { 592 temp_content.append(c); 593 i++; 594 // skip over html 595 while (i<content.length() && content.charAt(i)!='>') { 596 temp_content.append(content.charAt(i)); 597 i++; 602 } 603 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 604 605 if (in_phrase == false) { 606 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 607 phrase_query_p_term_variants_list = new ArrayList(); 608 } 609 } 610 611 // Watch for phrases (surrounded by quotes) 612 if (character == '\"') { 613 // Has a phrase just started? 614 if (in_phrase == false) { 615 in_phrase = true; 616 } 617 // Or has a phrase just finished? 618 else if (in_phrase == true) { 619 in_phrase = false; 620 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 621 } 622 623 phrase_query_p_term_variants_list = new ArrayList(); 624 } 625 } 626 627 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy); 628 } 629 630 631 /** 632 * Highlights query terms in a piece of text. 633 */ 634 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy) 635 { 636 // Convert the content string to an array of characters for speed 637 char[] content_characters = new char[content.length()]; 638 content.getChars(0, content.length(), content_characters, 0); 639 640 // Now skim through the content, identifying word matches 641 ArrayList word_matches = new ArrayList(); 642 int word_start = 0; 643 boolean in_word = false; 644 boolean preceding_word_matched = false; 645 for (int i = 0; i < content_characters.length; i++) { 646 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]); 647 648 // Has a word just started? 649 if (in_word == false && is_character_letter_or_digit == true) { 650 in_word = true; 651 word_start = i; 652 } 653 654 // Or has a word just finished? 655 else if (in_word == true && is_character_letter_or_digit == false) { 656 in_word = false; 657 658 // Check if the word matches any of the query term equivalents 659 String word = new String(content_characters, word_start, (i - word_start)); 660 if (query_term_variants.contains(word)) { 661 // We have found a matching word, so remember its location 662 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 663 preceding_word_matched = true; 664 } 665 else { 666 preceding_word_matched = false; 667 } 668 } 669 } 670 671 // Don't forget the last word... 672 if (in_word == true) { 673 // Check if the word matches any of the query term equivalents 674 String word = new String(content_characters, word_start, (content_characters.length - word_start)); 675 if (query_term_variants.contains(word)) { 676 // We have found a matching word, so remember its location 677 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched)); 678 } 679 } 680 681 ArrayList highlight_start_positions = new ArrayList(); 682 ArrayList highlight_end_positions = new ArrayList(); 683 684 // Deal with phrases now 685 ArrayList partial_phrase_matches = new ArrayList(); 686 for (int i = 0; i < word_matches.size(); i++) { 687 WordMatch word_match = (WordMatch) word_matches.get(i); 688 689 // See if any partial phrase matches are extended by this word 690 if (word_match.preceding_word_matched) { 691 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) { 692 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j); 693 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number); 694 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched); 695 if (phrase_query_p_term_x_variants.contains(word_match.word)) { 696 partial_phrase_match.num_words_matched++; 697 698 // Has a complete phrase match occurred? 699 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) { 700 // Check for overlaps by looking at the previous highlight range 701 if (!highlight_end_positions.isEmpty()) { 702 int last_highlight_index = highlight_end_positions.size() - 1; 703 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue(); 704 if (last_highlight_end > partial_phrase_match.start_position) { 705 // There is an overlap, so remove the previous phrase match 706 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue(); 707 highlight_end_positions.remove(last_highlight_index); 708 partial_phrase_match.start_position = last_highlight_start; 709 } 710 } 711 712 highlight_start_positions.add(new Integer(partial_phrase_match.start_position)); 713 highlight_end_positions.add(new Integer(word_match.end_position)); 714 } 715 // No, but add the partial match back into the list for next time 716 else { 717 partial_phrase_matches.add(partial_phrase_match); 718 } 719 } 720 } 721 } 722 else { 723 partial_phrase_matches.clear(); 724 } 725 726 // See if this word is at the start of any of the phrases 727 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) { 728 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p); 729 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0); 730 if (phrase_query_p_term_1_variants.contains(word_match.word)) { 731 // If this phrase is just one word long, we have a complete match 732 if (phrase_query_p_term_variants_list.size() == 1) { 733 highlight_start_positions.add(new Integer(word_match.start_position)); 734 highlight_end_positions.add(new Integer(word_match.end_position)); 598 735 } 599 temp_content.append(content.charAt(i)); 600 //temp_content.append(GSXML.xmlSafe(temp.toString())); 601 //temp.delete(0, temp.length()); 602 603 } else { 604 temp_content.append(c); 605 } 606 } 607 } 608 // append anything left of temp_content and temp 609 Text t = this.doc.createTextNode(temp_content.toString()); 610 new_content_elem.appendChild(t); 611 612 if (temp.length() > 0) { 613 Element annot = GSXML.createTextElement(this.doc, "annotation", temp.toString()); 614 annot.setAttribute("type", "query_term"); 615 new_content_elem.appendChild(annot); 616 } 617 //String content_string = "<nodeContent>"+new_content.toString()+"</nodeContent>"; 618 //Element content_elem = this.converter.getDOM(content_string).getDocumentElement(); 619 return new_content_elem; 736 // Otherwise we have the start of a potential phrase match 737 else { 738 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p)); 739 } 740 } 741 } 742 } 743 744 // Now add the annotation tags into the document at the correct points 745 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM); 746 747 int last_wrote = 0; 748 for (int i = 0; i < highlight_start_positions.size(); i++) { 749 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue(); 750 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue(); 751 752 // Print anything before the highlight range 753 if (last_wrote < highlight_start) { 754 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote)); 755 // System.err.print(preceding_text); 756 content_element.appendChild(this.doc.createTextNode(preceding_text)); 757 } 758 759 // Print the highlight text, annotated 760 if (highlight_end > last_wrote) { 761 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start)); 762 // System.err.print("|" + highlight_text + "|"); 763 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text); 764 annotation_element.setAttribute("type", "query_term"); 765 content_element.appendChild(annotation_element); 766 last_wrote = highlight_end; 767 } 768 } 769 770 // Finish off any unwritten text 771 if (last_wrote < content_characters.length) { 772 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote)); 773 // System.err.print(remaining_text); 774 content_element.appendChild(this.doc.createTextNode(remaining_text)); 775 } 776 777 return content_element; 778 } 779 780 781 static private class WordMatch 782 { 783 public String word; 784 public int start_position; 785 public int end_position; 786 public boolean preceding_word_matched; 787 788 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched) 789 { 790 this.word = word; 791 this.start_position = start_position; 792 this.end_position = end_position; 793 this.preceding_word_matched = preceding_word_matched; 794 } 795 } 796 797 798 static private class PartialPhraseMatch 799 { 800 public int start_position; 801 public int query_phrase_number; 802 public int num_words_matched; 803 804 public PartialPhraseMatch(int start_position, int query_phrase_number) 805 { 806 this.start_position = start_position; 807 this.query_phrase_number = query_phrase_number; 808 this.num_words_matched = 1; 809 } 620 810 } 621 811 }
Note:
See TracChangeset
for help on using the changeset viewer.