source: main/tags/2.80/gsdl/src/recpt/summarise.cpp@ 24528

Last change on this file since 24528 was 9620, checked in by kjdon, 19 years ago

added some x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1/**********************************************************************
2 *
3 * summarise.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26/* The function 'summarise' produces, given a document text and a query,
27 * a (query-biased) summary. In the future, several types of summaries will
28 * be supported.
29 */
30
31#include "summarise.h"
32#include "unitool.h"
33#include <string.h>
34
35
36/* **************** LOCAL PROTOTYPES **************** */
37
38text_t summarise_startend(text_t &htmlstr, int summaryLength);
39text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);
40
41text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
42text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
43bool paragraph_tag(text_t::iterator start);
44
45
46/****************************************************
47 NAME: summarise
48 DESC: produce a summary for a document
49*****************************************************/
50
51text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
52 // return summarise_startend(htmlstr,summaryLength);
53 return summarise_keywords(htmlstr,query,summaryLength);
54}
55
56
57/****************************************************
58 NAME: summarise_startend
59 DESC: return first and last sentences of a document
60*****************************************************/
61
62text_t summarise_startend(text_t &htmlstr, int summaryLength) {
63 text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
64 text_t answer;
65
66 // add first sentences up to half the summary length
67
68 text_t::iterator str_current = str_start;
69 while(str_current<str_end && answer.size()<(summaryLength/2)) {
70 text_t sentence = next_sentence(str_current,str_end);
71 answer.append(sentence);
72 }
73
74 summaryLength -= answer.size(); // summary length left for last sentences
75 if(summaryLength<0)
76 summaryLength = 0;
77
78 str_end = str_current;
79 str_current = htmlstr.end()-1;
80 text_t lastSentence;
81 while(str_current>str_end &&
82 lastSentence.size()<summaryLength) {
83 text_t sentence = previous_sentence(str_current,str_end);
84 lastSentence = sentence + lastSentence;
85 }
86
87 answer += " ... ";
88 answer += lastSentence;
89
90 return answer;
91}
92
93
94/****************************************************
95 NAME: summarise_keywords
96 DESC: build a summary with sentences containing keyword(s).
97 the sentences with most matching keywords are returned
98 first.
99*****************************************************/
100
101text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength) {
102
103 text_tarray allterms, terms;
104 splitchar(query.begin(),query.end(),' ',allterms);
105
106 // consider only non-empty terms
107 for(text_tarray::iterator term = allterms.begin();
108 term < allterms.end(); ++term) {
109 if(!(*term).empty())
110 terms.push_back(*term);
111 }
112
113 text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
114
115 //text_tarray::iterator terms_current = terms_start;
116 //strstr("merde",*terms_current);
117
118 text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
119
120 vector<text_tarray> answers(terms.size());
121 // an array of array of sentences for the summary:
122 // answers[0] contains sentences with 1 keyword
123 // answers[1] contains sentences with 2 keywords, etc.
124 vector<int> answersSize(terms.size());
125 // answersSize[0] is the combined size of sentences with 1 keyword, etc.
126 for(vector<int>::iterator size = answersSize.begin();
127 size<answersSize.end(); ++size)
128 *size = 0; // initialise sentence size
129
130 int totfound = 0;
131 text_t::iterator str_current = str_start;
132 while(str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
133 // if the size of best sentences is greater than summary, that's enough!
134 text_t sentence = next_sentence(str_current,str_end);
135
136 text_tarray::iterator terms_current = terms_start;
137 int nFound = 0;
138 while(terms_current!=terms_end) {
139 text_t::iterator word = findword(sentence.begin(),sentence.end(),
140 *terms_current);
141 if(word!=sentence.end())
142 { ++nFound; ++totfound; }
143 ++terms_current;
144 }
145
146 if(nFound>0 && answersSize[nFound-1]<summaryLength) {
147 answers[nFound-1].push_back(sentence);
148 answersSize[nFound-1] += sentence.size();
149 }
150 }
151
152 text_t answer;
153 for(vector<text_tarray>::iterator sentarray = answers.end()-1;
154 sentarray>=answers.begin(); --sentarray)
155 for(text_tarray::iterator sentence = (*sentarray).begin();
156 sentence < (*sentarray).end(); ++sentence) {
157 answer.append(*sentence);
158 if(answer.size()>=summaryLength)
159 return answer;
160 }
161
162 if(!answer.empty())
163 return answer;
164
165 return summarise_startend(htmlstr,summaryLength);
166}
167
168
169/* *********************** LOCAL FUNCTIONS ******************* */
170
171/* NAME: next_sentence
172 DESC: returns next sentence, text-only (ie. HTML markup is removed)
173 */
174
175text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
176 text_t sentence; // the sentence to be returned
177 bool foundPunctuation = false; // set to true by '.', '!' or '?'
178 while(start<end && !foundPunctuation) {
179 switch (*start) {
180 case '<': // skip over rest of html tag
181 if(paragraph_tag(start) && has_unicode_letdig(sentence))
182 foundPunctuation = true;
183 while ((start<end) && (*start!='>'))
184 ++start;
185 if(start<end) ++start;
186 break;
187 case '.':
188 case '!':
189 case '?':
190 sentence.push_back(*start);
191 ++start;
192 if(start>=end ||
193 (is_unicode_space(*start) && (*(start-2)<'A' || *(start-2)>'Z'))) {
194 foundPunctuation = true;
195 }
196 break;
197 default:
198 sentence.push_back(*start);
199 ++start;
200 break;
201 }
202 }
203 return sentence;
204}
205
206
207/* NAME: previous_sentence
208 DESC: returns previous sentence, text-only (ie. HTML markup is removed)
209 */
210
211text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
212 text_t sentence; // the sentence to be returned
213 bool found1stPunctuation = false, // set to true by '.', '!' or '?'
214 // first punct. is included in results
215 found2ndPunctuation = false; // second punct. is stop condition,
216 // and is not included in results
217 while(start>end && !found2ndPunctuation) {
218 switch (*start) {
219 case '>': // skip over rest of html tag
220 while ((start>end) && (*start!='<')) // backtrack to beginning of tag
221 --start;
222 if(start>end) {
223 if(paragraph_tag(start) && has_unicode_letdig(sentence))
224 found2ndPunctuation = true;
225 --start;
226 }
227 break;
228 case '.':
229 case '!':
230 case '?':
231 if(!is_unicode_space(*(start+1)) ||
232 (start-1>end && *(start-1)>='A' && *(start-1)<='Z')) {
233 // if next character is not a blank, or preceding character is
234 // a capital letter, we guess it's an acronym (e.g. "U.S.A.")
235 sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
236 start,start+1);
237 --start;
238 } else
239 if(has_unicode_letdig(sentence) || found1stPunctuation)
240 found2ndPunctuation = true;
241 else {
242 sentence.text_as_usvector().insert(
243 sentence.text_as_usvector().begin(),start,start+1);
244 --start;
245 found1stPunctuation = true;
246 }
247 break;
248 default:
249 sentence.text_as_usvector().insert(
250 sentence.text_as_usvector().begin(),start,start+1);
251 --start;
252 break;
253 }
254 }
255 return sentence;
256}
257
258
259// start is positioned on the '<'
260bool paragraph_tag(text_t::iterator start) {
261 if(*start=='<') {
262 ++start;
263 if(*start=='p' || *start=='P') {
264 ++start;
265 if(is_unicode_space(*start) || *start=='>')
266 return true;
267 }
268 }
269 return false;
270}
Note: See TracBrowser for help on using the repository browser.