/********************************************************************** * * summarise.cpp -- * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ /* The function 'summarise' produces, given a document text and a query, * a (query-biased) summary. In the future, several types of summaries will * be supported. */ #include "summarise.h" #include "unitool.h" #include /* **************** LOCAL PROTOTYPES **************** */ text_t summarise_startend(text_t &htmlstr, int summaryLength); text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength); text_t next_sentence(text_t::iterator& start, text_t::iterator& end); text_t previous_sentence(text_t::iterator& start, text_t::iterator& end); bool paragraph_tag(text_t::iterator start); /**************************************************** NAME: summarise DESC: produce a summary for a document *****************************************************/ text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) { // return summarise_startend(htmlstr,summaryLength); return summarise_keywords(htmlstr,query,summaryLength); } /**************************************************** NAME: summarise_startend DESC: return first and last sentences of a document *****************************************************/ text_t summarise_startend(text_t &htmlstr, int summaryLength) { text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end(); text_t answer; // add first sentences up to half the summary length text_t::iterator str_current = str_start; while(str_currentstr_end && lastSentence.size() answers(terms.size()); // an array of array of sentences for the summary: // answers[0] contains sentences with 1 keyword // answers[1] contains sentences with 2 keywords, etc. vector answersSize(terms.size()); // answersSize[0] is the combined size of sentences with 1 keyword, etc. for(vector::iterator size = answersSize.begin(); size0 && answersSize[nFound-1]::iterator sentarray = answers.end()-1; sentarray>=answers.begin(); --sentarray) for(text_tarray::iterator sentence = (*sentarray).begin(); sentence < (*sentarray).end(); ++sentence) { answer.append(*sentence); if(answer.size()>=summaryLength) return answer; } if(!answer.empty()) return answer; return summarise_startend(htmlstr,summaryLength); } /* *********************** LOCAL FUNCTIONS ******************* */ /* NAME: next_sentence DESC: returns next sentence, text-only (ie. HTML markup is removed) */ text_t next_sentence(text_t::iterator& start, text_t::iterator& end) { text_t sentence; // the sentence to be returned bool foundPunctuation = false; // set to true by '.', '!' or '?' while(start=end || (is_unicode_space(*start) && (*(start-2)<'A' || *(start-2)>'Z'))) { foundPunctuation = true; } break; default: sentence.push_back(*start); ++start; break; } } return sentence; } /* NAME: previous_sentence DESC: returns previous sentence, text-only (ie. HTML markup is removed) */ text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) { text_t sentence; // the sentence to be returned bool found1stPunctuation = false, // set to true by '.', '!' or '?' // first punct. is included in results found2ndPunctuation = false; // second punct. is stop condition, // and is not included in results while(start>end && !found2ndPunctuation) { switch (*start) { case '>': // skip over rest of html tag while ((start>end) && (*start!='<')) // backtrack to beginning of tag --start; if(start>end) { if(paragraph_tag(start) && has_unicode_letdig(sentence)) found2ndPunctuation = true; --start; } break; case '.': case '!': case '?': if(!is_unicode_space(*(start+1)) || (start-1>end && *(start-1)>='A' && *(start-1)<='Z')) { // if next character is not a blank, or preceding character is // a capital letter, we guess it's an acronym (e.g. "U.S.A.") sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(), start,start+1); --start; } else if(has_unicode_letdig(sentence) || found1stPunctuation) found2ndPunctuation = true; else { sentence.text_as_usvector().insert( sentence.text_as_usvector().begin(),start,start+1); --start; found1stPunctuation = true; } break; default: sentence.text_as_usvector().insert( sentence.text_as_usvector().begin(),start,start+1); --start; break; } } return sentence; } // start is positioned on the '<' bool paragraph_tag(text_t::iterator start) { if(*start=='<') { ++start; if(*start=='p' || *start=='P') { ++start; if(is_unicode_space(*start) || *start=='>') return true; } } return false; }