Context Navigation

source: trunk/gsdl/src/recpt/summarise.cpp@ 13017

Last change on this file since 13017 was 9620, checked in by kjdon, 19 years ago
added some x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Rev	Line
[2967]	1	/**********************************************************************
	2	*
	3	* summarise.cpp --
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
	9	*
	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
	24	*********************************************************************/
	25
	26	/* The function 'summarise' produces, given a document text and a query,
	27	* a (query-biased) summary. In the future, several types of summaries will
	28	* be supported.
	29	*/
	30
	31	#include "summarise.h"
[2971]	32	#include "unitool.h"
[2967]	33	#include <string.h>
	34
	35
	36	/* ************** LOCAL PROTOTYPES ************** */
	37
	38	text_t summarise_startend(text_t &htmlstr, int summaryLength); // summary built from the first and last sentences
	39	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength); // summary built from sentences containing query terms
	40
	41	text_t next_sentence(text_t::iterator& start, text_t::iterator& end); // scans forward from start (advancing it); returns the sentence without HTML tags
	42	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end); // scans backward from start (moving it back) towards end; returns the sentence without HTML tags
	43	bool paragraph_tag(text_t::iterator start); // true when start points at the '<' of a <p> / <P> tag
	44
	45
	46	/****************************************************
	47	NAME: summarise
	48	DESC: produce a summary for a document
	49	*****************************************************/
	50
	51	text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
	52	// return summarise_startend(htmlstr,summaryLength);
	53	return summarise_keywords(htmlstr,query,summaryLength);
	54	}
	55
	56
	57	/****************************************************
	58	NAME: summarise_startend
	59	DESC: return first and last sentences of a document
	60	*****************************************************/
	61
	62	text_t summarise_startend(text_t &htmlstr, int summaryLength) {
	63	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
	64	text_t answer;
	65
	66	// add first sentences up to half the summary length
	67
	68	text_t::iterator str_current = str_start;
	69	while(str_current<str_end && answer.size()<(summaryLength/2)) {
	70	text_t sentence = next_sentence(str_current,str_end);
	71	answer.append(sentence);
	72	}
	73
	74	summaryLength -= answer.size(); // summary length left for last sentences
	75	if(summaryLength<0)
	76	summaryLength = 0;
	77
	78	str_end = str_current;
	79	str_current = htmlstr.end()-1;
	80	text_t lastSentence;
	81	while(str_current>str_end &&
	82	lastSentence.size()<summaryLength) {
	83	text_t sentence = previous_sentence(str_current,str_end);
	84	lastSentence = sentence + lastSentence;
	85	}
	86
	87	answer += " ... ";
	88	answer += lastSentence;
	89
	90	return answer;
	91	}
	92
	93
	94	/****************************************************
	95	NAME: summarise_keywords
	96	DESC: build a summary with sentences containing keyword(s).
	97	the sentences with most matching keywords are returned
	98	first.
	99	*****************************************************/
	100
	101	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength) {
	102
	103	text_tarray allterms, terms;
	104	splitchar(query.begin(),query.end(),' ',allterms);
	105
	106	// consider only non-empty terms
	107	for(text_tarray::iterator term = allterms.begin();
[9620]	108	term < allterms.end(); ++term) {
[2967]	109	if(!(*term).empty())
	110	terms.push_back(*term);
	111	}
	112
	113	text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
	114
	115	//text_tarray::iterator terms_current = terms_start;
	116	//strstr("merde",*terms_current);
	117
	118	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
	119
	120	vector<text_tarray> answers(terms.size());
	121	// an array of array of sentences for the summary:
	122	// answers[0] contains sentences with 1 keyword
	123	// answers[1] contains sentences with 2 keywords, etc.
	124	vector<int> answersSize(terms.size());
	125	// answersSize[0] is the combined size of sentences with 1 keyword, etc.
	126	for(vector<int>::iterator size = answersSize.begin();
[9620]	127	size<answersSize.end(); ++size)
[2967]	128	*size = 0; // initialise sentence size
	129
	130	int totfound = 0;
	131	text_t::iterator str_current = str_start;
	132	while(str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
	133	// if the size of best sentences is greater than summary, that's enough!
	134	text_t sentence = next_sentence(str_current,str_end);
	135
	136	text_tarray::iterator terms_current = terms_start;
	137	int nFound = 0;
	138	while(terms_current!=terms_end) {
	139	text_t::iterator word = findword(sentence.begin(),sentence.end(),
	140	*terms_current);
	141	if(word!=sentence.end())
[9620]	142	{ ++nFound; ++totfound; }
	143	++terms_current;
[2967]	144	}
	145
	146	if(nFound>0 && answersSize[nFound-1]<summaryLength) {
	147	answers[nFound-1].push_back(sentence);
	148	answersSize[nFound-1] += sentence.size();
	149	}
	150	}
	151
	152	text_t answer;
	153	for(vector<text_tarray>::iterator sentarray = answers.end()-1;
[9620]	154	sentarray>=answers.begin(); --sentarray)
[2967]	155	for(text_tarray::iterator sentence = (*sentarray).begin();
[9620]	156	sentence < (*sentarray).end(); ++sentence) {
[2967]	157	answer.append(*sentence);
	158	if(answer.size()>=summaryLength)
	159	return answer;
	160	}
	161
	162	if(!answer.empty())
	163	return answer;
	164
	165	return summarise_startend(htmlstr,summaryLength);
	166	}
	167
	168
	169	/* ********************* LOCAL FUNCTIONS ***************** */
	170
	171	/* NAME: next_sentence
	172	DESC: returns next sentence, text-only (ie. HTML markup is removed)
	173	*/
	174
	175	text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
	176	text_t sentence; // the sentence to be returned
	177	bool foundPunctuation = false; // set to true by '.', '!' or '?'
	178	while(start<end && !foundPunctuation) {
	179	switch (*start) {
	180	case '<': // skip over rest of html tag
	181	if(paragraph_tag(start) && has_unicode_letdig(sentence))
	182	foundPunctuation = true;
	183	while ((start<end) && (*start!='>'))
[9620]	184	++start;
	185	if(start<end) ++start;
[2967]	186	break;
	187	case '.':
	188	case '!':
	189	case '?':
	190	sentence.push_back(*start);
[9620]	191	++start;
[2967]	192	if(start>=end \|\|
	193	(is_unicode_space(start) && ((start-2)<'A' \|\| *(start-2)>'Z'))) {
	194	foundPunctuation = true;
	195	}
	196	break;
	197	default:
	198	sentence.push_back(*start);
[9620]	199	++start;
[2967]	200	break;
	201	}
	202	}
	203	return sentence;
	204	}
	205
	206
	207	/* NAME: previous_sentence
	208	DESC: returns previous sentence, text-only (ie. HTML markup is removed)
	209	*/
	210
	211	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
	212	text_t sentence; // the sentence to be returned
	213	bool found1stPunctuation = false, // set to true by '.', '!' or '?'
	214	// first punct. is included in results
	215	found2ndPunctuation = false; // second punct. is stop condition,
	216	// and is not included in results
	217	while(start>end && !found2ndPunctuation) {
	218	switch (*start) {
	219	case '>': // skip over rest of html tag
	220	while ((start>end) && (*start!='<')) // backtrack to beginning of tag
[9620]	221	--start;
[2967]	222	if(start>end) {
	223	if(paragraph_tag(start) && has_unicode_letdig(sentence))
	224	found2ndPunctuation = true;
[9620]	225	--start;
[2967]	226	}
	227	break;
	228	case '.':
	229	case '!':
	230	case '?':
	231	if(!is_unicode_space(*(start+1)) \|\|
	232	(start-1>end && (start-1)>='A' && (start-1)<='Z')) {
	233	// if next character is not a blank, or preceding character is
	234	// a capital letter, we guess it's an acronym (e.g. "U.S.A.")
	235	sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
	236	start,start+1);
[9620]	237	--start;
[2967]	238	} else
	239	if(has_unicode_letdig(sentence) \|\| found1stPunctuation)
	240	found2ndPunctuation = true;
	241	else {
	242	sentence.text_as_usvector().insert(
	243	sentence.text_as_usvector().begin(),start,start+1);
[9620]	244	--start;
[2967]	245	found1stPunctuation = true;
	246	}
	247	break;
	248	default:
	249	sentence.text_as_usvector().insert(
	250	sentence.text_as_usvector().begin(),start,start+1);
[9620]	251	--start;
[2967]	252	break;
	253	}
	254	}
	255	return sentence;
	256	}
	257
	258
	259	// start is positioned on the '<'
	260	bool paragraph_tag(text_t::iterator start) {
	261	if(*start=='<') {
[9620]	262	++start;
[2967]	263	if(start=='p' \|\| start=='P') {
[9620]	264	++start;
[2967]	265	if(is_unicode_space(start) \|\| start=='>')
	266	return true;
	267	}
	268	}
	269	return false;
	270	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: