source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GoogleSearch.java.tmp@ 28966

Last change on this file since 28966 was 28966, checked in by kjdon, 10 years ago

Lots of changes, mainly to do with removing this.doc from everywhere. Document is not thread safe. Now we tend to create a new Document every time we are starting a new page/message etc. In services, this.desc_doc is available as the document to create service info stuff, but it should only be used for this and not for other messages. newDOM is now static for XMLConverter. Method param changes for some GSXML methods.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6//Google Web Services API classes
7//import com.google.soap.search.GoogleSearch;
8import com.google.soap.search.GoogleSearchFault;
9import com.google.soap.search.GoogleSearchResult;
10import com.google.soap.search.GoogleSearchResultElement;
11import com.google.soap.search.GoogleSearchDirectoryCategory;
12
13// XML classes
14import org.w3c.dom.Element;
15import org.w3c.dom.Document;
16import org.w3c.dom.NodeList;
17
18//Java classes
19import java.util.ArrayList;
20import java.util.HashMap;
21import java.io.File;
22import java.io.InputStream;
23import java.io.BufferedReader;
24import java.io.InputStreamReader;
25import java.io.IOException;
26import java.net.HttpURLConnection;
27import java.net.URLConnection;
28import java.net.URL;
29import java.net.Authenticator;
30import java.net.PasswordAuthentication;
31import java.net.MalformedURLException;
32import java.lang.Object;
33
34import org.apache.log4j.*;
35
36/**
37 *
38 * @author <a href="mailto:[email protected]">Chi-Yu Huang</a>
39 *
40 */
41
42public class GoogleSearch
43 extends AbstractSearch {
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GoogleSearch.class.getName());
46
47 //Parameters connect to Proxy Server
48 private boolean using_proxy = false;
49 private String proxy_host = null;
50 private int proxy_port;
51 private String proxy_user = null;
52 private char [] proxy_passwd = null;
53
54 // google key
55 private String client_key = null;
56
57 public GoogleSearch()
58 {
59 }
60
61 //Configure GoogleSearch Service
62 public boolean configure(Element info, Element extra_info)
63 {
64 if (!super.configure(info, extra_info)) {
65 return false;
66 }
67 logger.info("Configuring GoogleSearch");
68 Element server_elem = (Element)GSXML.getChildByTagName(info, "googleServer");
69 if (server_elem == null) {
70 logger.error("no googleServer element found");
71 return false;
72 }
73
74 client_key = server_elem.getAttribute("key");
75 if (client_key.equals("")) {
76 logger.error("no client_key for the googleServer element");
77 return false;
78 }
79
80 does_paging = true;
81 does_chunking = false;
82
83 // are we behind a proxy??
84 // all the details should have been set up by the Message Router
85 proxy_host = System.getProperty("http.proxyHost");
86 if (proxy_host != null && !proxy_host.equals("")) {
87 using_proxy = true;
88 try {
89 proxy_port = Integer.parseInt(System.getProperty("http.proxyPort").trim());
90 } catch (Exception e) {
91 logger.error("couldn't get proxy port, defaulting to 80");
92 proxy_port = 80;
93 }
94 PasswordAuthentication pa = Authenticator.requestPasswordAuthentication(proxy_host, null, proxy_port, "http", "", null);
95 proxy_user = pa.getUserName();
96 proxy_passwd = pa.getPassword();
97 }
98 return true;
99 }
100
101 /** Process a text query - implemented by concrete subclasses */
102 protected Element processTextQuery(Element request) {
103
104 //Connect to Google Web API service
105 com.google.soap.search.GoogleSearch search = new com.google.soap.search.GoogleSearch();
106
107 // Set mandatory attributes
108 search.setKey(client_key);
109
110 // proxy??
111 if (using_proxy) {
112 search.setProxyHost(proxy_host);
113 search.setProxyPort(proxy_port);
114 search.setProxyUserName(proxy_user);
115 search.setProxyPassword(new String(proxy_passwd));
116 }
117
118 //set optional attributes
119 search.setSafeSearch(true);
120
121 // Create a new (empty) result message
122 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
123 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
124 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
125 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
126 result.appendChild(doc_node_list);
127 Element query_metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
128 result.appendChild(query_metadata_list);
129
130 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
131 if (param_list == null) {
132 logger.error("TextQuery request had no paramList.");
133 return result; // Return the empty result
134 }
135
136 // Process the request parameters
137 HashMap params = GSXML.extractParams(param_list, false);
138
139 // Make sure a query has been specified
140 String query = (String) params.get(QUERY_PARAM);
141 if (query == null || query.equals("")) {
142 logger.error("TextQuery request had no query string.");
143 return result; // Return the empty result
144 }
145 // tidy whitespace
146 query = query.replaceAll("\\s+", "+");
147
148 search.setQueryString(query);
149
150 //Check hits_per_page
151 int hits_per_page;
152 try {
153 hits_per_page = Integer.parseInt(((String)params.get(HITS_PER_PAGE_PARAM)).trim());
154 } catch (Exception e) {
155 logger.error("couldn't get hits per page param, defaulting to 10");
156 hits_per_page = 10;
157 }
158 //Check the start_page number
159 int start_page;
160 try {
161 start_page = Integer.parseInt(((String) params.get(START_PAGE_PARAM)).trim());
162 } catch (Exception e) {
163 logger.error("couldn't get start page param, defaulting to 1");
164 start_page = 1;
165 }
166
167 //Invoke Actual Search
168
169 // Google only allows 10 hits per request
170 int loop = hits_per_page/10;
171 int remainder = hits_per_page%10;
172 int google_start_page = (start_page-1)*hits_per_page;
173 int pages_per_loop;
174 for (int j=0; j < loop; j++){
175 if (j < (loop-1) || remainder == 0) {
176 pages_per_loop = 10;
177 } else {
178 pages_per_loop = remainder;
179 }
180 search.setMaxResults(pages_per_loop);
181 search.setStartResult(google_start_page);
182 google_start_page = google_start_page + pages_per_loop;
183 GoogleSearchResult google_result;
184 try{
185 google_result = search.doSearch();
186 } catch (GoogleSearchFault ex) {
187 logger.error("the call to the Google Web APIs failed:" + ex.toString());
188 // add the error to the result
189 return result;
190 }
191 if (j==0) {
192
193 //Total amount of documents Google Search returned
194 // only need to do this on the first loop
195
196 long numdocs_matched = google_result.getEstimatedTotalResultsCount();
197 GSXML.addMetadata(query_metadata_list, "numDocsMatched", ""+numdocs_matched);
198
199 }
200 GoogleSearchResultElement[] details = google_result.getResultElements();
201 for (int i=0; i<details.length; i++){
202 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
203 doc_node_list.appendChild(doc_node);
204
205 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
206 doc_node.appendChild(metadata_list);
207
208 String google_url = details[i].getURL();
209 String google_title = details[i].getTitle();
210 String google_snippet = details[i].getSnippet();
211 if (google_url !=null) {
212 GSXML.addMetadata(metadata_list, "URL", google_url);
213 }
214 if (google_title != null) {
215 GSXML.addMetadata(metadata_list, "Title", google_title);
216 }
217 if (google_snippet != null) {
218 GSXML.addMetadata(metadata_list, "Snippet", google_snippet);
219 }
220 }
221 } // for each loop
222 return result;
223 }
224
225 protected void getIndexData(ArrayList index_ids, ArrayList index_names,String lang){
226 index_ids.add("idx");
227 index_names.add("Google main index");
228 }
229
230}
Note: See TracBrowser for help on using the repository browser.