Context Navigation

source: main/tags/2.80/gsdl/src/recpt/summarise.cpp@ 24528

Last change on this file since 24528 was 9620, checked in by kjdon, 19 years ago
added some x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	/**********************************************************************
2	*
3	* summarise.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	/* The function 'summarise' produces, given a document text and a query,
27	* a (query-biased) summary. In the future, several types of summaries will
28	* be supported.
29	*/
30
31	#include "summarise.h"
32	#include "unitool.h"
33	#include <string.h>
34
35
36	/* ************** LOCAL PROTOTYPES ************** */
37
38	text_t summarise_startend(text_t &htmlstr, int summaryLength);
39	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);
40
41	text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
42	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
43	bool paragraph_tag(text_t::iterator start);
44
45
46	/****************************************************
47	NAME: summarise
48	DESC: produce a summary for a document
49	*****************************************************/
50
51	text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
52	// return summarise_startend(htmlstr,summaryLength);
53	return summarise_keywords(htmlstr,query,summaryLength);
54	}
55
56
57	/****************************************************
58	NAME: summarise_startend
59	DESC: return first and last sentences of a document
60	*****************************************************/
61
62	text_t summarise_startend(text_t &htmlstr, int summaryLength) {
63	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
64	text_t answer;
65
66	// add first sentences up to half the summary length
67
68	text_t::iterator str_current = str_start;
69	while(str_current<str_end && answer.size()<(summaryLength/2)) {
70	text_t sentence = next_sentence(str_current,str_end);
71	answer.append(sentence);
72	}
73
74	summaryLength -= answer.size(); // summary length left for last sentences
75	if(summaryLength<0)
76	summaryLength = 0;
77
78	str_end = str_current;
79	str_current = htmlstr.end()-1;
80	text_t lastSentence;
81	while(str_current>str_end &&
82	lastSentence.size()<summaryLength) {
83	text_t sentence = previous_sentence(str_current,str_end);
84	lastSentence = sentence + lastSentence;
85	}
86
87	answer += " ... ";
88	answer += lastSentence;
89
90	return answer;
91	}
92
93
94	/****************************************************
95	NAME: summarise_keywords
96	DESC: build a summary with sentences containing keyword(s).
97	the sentences with most matching keywords are returned
98	first.
99	*****************************************************/
100
101	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength) {
102
103	text_tarray allterms, terms;
104	splitchar(query.begin(),query.end(),' ',allterms);
105
106	// consider only non-empty terms
107	for(text_tarray::iterator term = allterms.begin();
108	term < allterms.end(); ++term) {
109	if(!(*term).empty())
110	terms.push_back(*term);
111	}
112
113	text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
114
115	//text_tarray::iterator terms_current = terms_start;
116	//strstr("merde",*terms_current);
117
118	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
119
120	vector<text_tarray> answers(terms.size());
121	// an array of array of sentences for the summary:
122	// answers[0] contains sentences with 1 keyword
123	// answers[1] contains sentences with 2 keywords, etc.
124	vector<int> answersSize(terms.size());
125	// answersSize[0] is the combined size of sentences with 1 keyword, etc.
126	for(vector<int>::iterator size = answersSize.begin();
127	size<answersSize.end(); ++size)
128	*size = 0; // initialise sentence size
129
130	int totfound = 0;
131	text_t::iterator str_current = str_start;
132	while(str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
133	// if the size of best sentences is greater than summary, that's enough!
134	text_t sentence = next_sentence(str_current,str_end);
135
136	text_tarray::iterator terms_current = terms_start;
137	int nFound = 0;
138	while(terms_current!=terms_end) {
139	text_t::iterator word = findword(sentence.begin(),sentence.end(),
140	*terms_current);
141	if(word!=sentence.end())
142	{ ++nFound; ++totfound; }
143	++terms_current;
144	}
145
146	if(nFound>0 && answersSize[nFound-1]<summaryLength) {
147	answers[nFound-1].push_back(sentence);
148	answersSize[nFound-1] += sentence.size();
149	}
150	}
151
152	text_t answer;
153	for(vector<text_tarray>::iterator sentarray = answers.end()-1;
154	sentarray>=answers.begin(); --sentarray)
155	for(text_tarray::iterator sentence = (*sentarray).begin();
156	sentence < (*sentarray).end(); ++sentence) {
157	answer.append(*sentence);
158	if(answer.size()>=summaryLength)
159	return answer;
160	}
161
162	if(!answer.empty())
163	return answer;
164
165	return summarise_startend(htmlstr,summaryLength);
166	}
167
168
169	/* ********************* LOCAL FUNCTIONS ***************** */
170
171	/* NAME: next_sentence
172	DESC: returns next sentence, text-only (ie. HTML markup is removed)
173	*/
174
175	text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
176	text_t sentence; // the sentence to be returned
177	bool foundPunctuation = false; // set to true by '.', '!' or '?'
178	while(start<end && !foundPunctuation) {
179	switch (*start) {
180	case '<': // skip over rest of html tag
181	if(paragraph_tag(start) && has_unicode_letdig(sentence))
182	foundPunctuation = true;
183	while ((start<end) && (*start!='>'))
184	++start;
185	if(start<end) ++start;
186	break;
187	case '.':
188	case '!':
189	case '?':
190	sentence.push_back(*start);
191	++start;
192	if(start>=end \|\|
193	(is_unicode_space(start) && ((start-2)<'A' \|\| *(start-2)>'Z'))) {
194	foundPunctuation = true;
195	}
196	break;
197	default:
198	sentence.push_back(*start);
199	++start;
200	break;
201	}
202	}
203	return sentence;
204	}
205
206
207	/* NAME: previous_sentence
208	DESC: returns previous sentence, text-only (ie. HTML markup is removed)
209	*/
210
211	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
212	text_t sentence; // the sentence to be returned
213	bool found1stPunctuation = false, // set to true by '.', '!' or '?'
214	// first punct. is included in results
215	found2ndPunctuation = false; // second punct. is stop condition,
216	// and is not included in results
217	while(start>end && !found2ndPunctuation) {
218	switch (*start) {
219	case '>': // skip over rest of html tag
220	while ((start>end) && (*start!='<')) // backtrack to beginning of tag
221	--start;
222	if(start>end) {
223	if(paragraph_tag(start) && has_unicode_letdig(sentence))
224	found2ndPunctuation = true;
225	--start;
226	}
227	break;
228	case '.':
229	case '!':
230	case '?':
231	if(!is_unicode_space(*(start+1)) \|\|
232	(start-1>end && (start-1)>='A' && (start-1)<='Z')) {
233	// if next character is not a blank, or preceding character is
234	// a capital letter, we guess it's an acronym (e.g. "U.S.A.")
235	sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
236	start,start+1);
237	--start;
238	} else
239	if(has_unicode_letdig(sentence) \|\| found1stPunctuation)
240	found2ndPunctuation = true;
241	else {
242	sentence.text_as_usvector().insert(
243	sentence.text_as_usvector().begin(),start,start+1);
244	--start;
245	found1stPunctuation = true;
246	}
247	break;
248	default:
249	sentence.text_as_usvector().insert(
250	sentence.text_as_usvector().begin(),start,start+1);
251	--start;
252	break;
253	}
254	}
255	return sentence;
256	}
257
258
259	// start is positioned on the '<'
260	bool paragraph_tag(text_t::iterator start) {
261	if(*start=='<') {
262	++start;
263	if(start=='p' \|\| start=='P') {
264	++start;
265	if(is_unicode_space(start) \|\| start=='>')
266	return true;
267	}
268	}
269	return false;
270	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: