Context Navigation

source: main/trunk/greenstone2/runtime-src/src/recpt/summarise.cpp@ 21810

Last change on this file since 21810 was 20805, checked in by davidb, 15 years ago
Extra checks added to summarise_keywords() to ensure that the htmlstr and query are non-trivial (i.e. not empty)
Property svn:keywords set to `Author Date Id Revision`
File size: 8.7 KB

Line
1	/**********************************************************************
2	*
3	* summarise.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	/* The function 'summarise' produces, given a document text and a query,
27	* a (query-biased) summary. In the future, several types of summaries will
28	* be supported.
29	*/
30
31	#include "summarise.h"
32	#include "unitool.h"
33	#include <string.h>
34
35
36	/* ************** LOCAL PROTOTYPES ************** */
37
38	text_t summarise_startend(text_t &htmlstr, int summaryLength);
39	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);
40
41	text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
42	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
43	bool paragraph_tag(text_t::iterator start);
44
45
46	/****************************************************
47	NAME: summarise
48	DESC: produce a summary for a document
49	*****************************************************/
50
51	text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
52	// return summarise_startend(htmlstr,summaryLength);
53	return summarise_keywords(htmlstr,query,summaryLength);
54	}
55
56
57	/****************************************************
58	NAME: summarise_startend
59	DESC: return first and last sentences of a document
60	*****************************************************/
61
62	text_t summarise_startend(text_t &htmlstr, int summaryLength) {
63	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
64	text_t answer;
65
66	// add first sentences up to half the summary length
67
68	text_t::iterator str_current = str_start;
69	while(str_current<str_end && answer.size()<(summaryLength/2)) {
70	text_t sentence = next_sentence(str_current,str_end);
71	answer.append(sentence);
72	}
73
74	summaryLength -= answer.size(); // summary length left for last sentences
75	if(summaryLength<0)
76	summaryLength = 0;
77
78	str_end = str_current;
79	str_current = htmlstr.end()-1;
80	text_t lastSentence;
81	while(str_current>str_end &&
82	lastSentence.size()<summaryLength) {
83	text_t sentence = previous_sentence(str_current,str_end);
84	lastSentence = sentence + lastSentence;
85	}
86
87	answer += " ... ";
88	answer += lastSentence;
89
90	return answer;
91	}
92
93
94	/****************************************************
95	NAME: summarise_keywords
96	DESC: build a summary with sentences containing keyword(s).
97	the sentences with most matching keywords are returned
98	first.
99	*****************************************************/
100
101	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength) {
102
103	if ((query.size()==0) \|\| (htmlstr.size()==0)) {
104	return "";
105	}
106
107	text_tarray allterms, terms;
108	splitchar(query.begin(),query.end(),' ',allterms);
109
110	// consider only non-empty terms
111	for(text_tarray::iterator term = allterms.begin();
112	term < allterms.end(); ++term) {
113	if(!(*term).empty())
114	terms.push_back(*term);
115	}
116
117	if (terms.size()==0) {
118	return "";
119	}
120
121	text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
122
123	//text_tarray::iterator terms_current = terms_start;
124	//strstr("merde",*terms_current);
125
126	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
127
128	vector<text_tarray> answers(terms.size());
129	// an array of array of sentences for the summary:
130	// answers[0] contains sentences with 1 keyword
131	// answers[1] contains sentences with 2 keywords, etc.
132	vector<int> answersSize(terms.size());
133	// answersSize[0] is the combined size of sentences with 1 keyword, etc.
134	for(vector<int>::iterator size = answersSize.begin();
135	size<answersSize.end(); ++size)
136	*size = 0; // initialise sentence size
137
138	int totfound = 0;
139	text_t::iterator str_current = str_start;
140	while(str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
141	// if the size of best sentences is greater than summary, that's enough!
142	text_t sentence = next_sentence(str_current,str_end);
143
144	text_tarray::iterator terms_current = terms_start;
145	int nFound = 0;
146	while(terms_current!=terms_end) {
147	text_t::iterator word = findword(sentence.begin(),sentence.end(),
148	*terms_current);
149	if(word!=sentence.end())
150	{ ++nFound; ++totfound; }
151	++terms_current;
152	}
153
154	if(nFound>0 && answersSize[nFound-1]<summaryLength) {
155	answers[nFound-1].push_back(sentence);
156	answersSize[nFound-1] += sentence.size();
157	}
158	}
159
160	text_t answer;
161	for(vector<text_tarray>::iterator sentarray = answers.end()-1;
162	sentarray>=answers.begin(); --sentarray)
163	for(text_tarray::iterator sentence = (*sentarray).begin();
164	sentence < (*sentarray).end(); ++sentence) {
165	answer.append(*sentence);
166	if(answer.size()>=summaryLength)
167	return answer;
168	}
169
170	if(!answer.empty())
171	return answer;
172
173	return summarise_startend(htmlstr,summaryLength);
174	}
175
176
177	/* ********************* LOCAL FUNCTIONS ***************** */
178
179	/* NAME: next_sentence
180	DESC: returns next sentence, text-only (ie. HTML markup is removed)
181	*/
182
183	text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
184	text_t sentence; // the sentence to be returned
185	bool foundPunctuation = false; // set to true by '.', '!' or '?'
186	while(start<end && !foundPunctuation) {
187	switch (*start) {
188	case '<': // skip over rest of html tag
189	if(paragraph_tag(start) && has_unicode_letdig(sentence))
190	foundPunctuation = true;
191	while ((start<end) && (*start!='>'))
192	++start;
193	if(start<end) ++start;
194	break;
195	case '.':
196	case '!':
197	case '?':
198	sentence.push_back(*start);
199	++start;
200	if(start>=end \|\|
201	(is_unicode_space(start) && ((start-2)<'A' \|\| *(start-2)>'Z'))) {
202	foundPunctuation = true;
203	}
204	break;
205	default:
206	sentence.push_back(*start);
207	++start;
208	break;
209	}
210	}
211	return sentence;
212	}
213
214
215	/* NAME: previous_sentence
216	DESC: returns previous sentence, text-only (ie. HTML markup is removed)
217	*/
218
219	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
220	text_t sentence; // the sentence to be returned
221	bool found1stPunctuation = false, // set to true by '.', '!' or '?'
222	// first punct. is included in results
223	found2ndPunctuation = false; // second punct. is stop condition,
224	// and is not included in results
225	while(start>end && !found2ndPunctuation) {
226	switch (*start) {
227	case '>': // skip over rest of html tag
228	while ((start>end) && (*start!='<')) // backtrack to beginning of tag
229	--start;
230	if(start>end) {
231	if(paragraph_tag(start) && has_unicode_letdig(sentence))
232	found2ndPunctuation = true;
233	--start;
234	}
235	break;
236	case '.':
237	case '!':
238	case '?':
239	if(!is_unicode_space(*(start+1)) \|\|
240	(start-1>end && (start-1)>='A' && (start-1)<='Z')) {
241	// if next character is not a blank, or preceding character is
242	// a capital letter, we guess it's an acronym (e.g. "U.S.A.")
243	sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
244	start,start+1);
245	--start;
246	} else
247	if(has_unicode_letdig(sentence) \|\| found1stPunctuation)
248	found2ndPunctuation = true;
249	else {
250	sentence.text_as_usvector().insert(
251	sentence.text_as_usvector().begin(),start,start+1);
252	--start;
253	found1stPunctuation = true;
254	}
255	break;
256	default:
257	sentence.text_as_usvector().insert(
258	sentence.text_as_usvector().begin(),start,start+1);
259	--start;
260	break;
261	}
262	}
263	return sentence;
264	}
265
266
267	// start is positioned on the '<'
268	bool paragraph_tag(text_t::iterator start) {
269	if(*start=='<') {
270	++start;
271	if(start=='p' \|\| start=='P') {
272	++start;
273	if(is_unicode_space(start) \|\| start=='>')
274	return true;
275	}
276	}
277	return false;
278	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: