source: tags/ant-install-branch-merged-1/gsdl3/src/java/org/greenstone/gsdl3/service/GoogleSearch.java.tmp@ 9873

Last change on this file since 9873 was 9873, checked in by (none), 19 years ago

This commit was manufactured by cvs2svn to create tag
'ant-install-branch-merged-1'.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6//Google Web Services API classes
7//import com.google.soap.search.GoogleSearch;
8import com.google.soap.search.GoogleSearchFault;
9import com.google.soap.search.GoogleSearchResult;
10import com.google.soap.search.GoogleSearchResultElement;
11import com.google.soap.search.GoogleSearchDirectoryCategory;
12
13// XML classes
14import org.w3c.dom.Element;
15import org.w3c.dom.Document;
16import org.w3c.dom.NodeList;
17
18//Java classes
19import java.util.ArrayList;
20import java.util.HashMap;
21import java.io.File;
22import java.io.InputStream;
23import java.io.BufferedReader;
24import java.io.InputStreamReader;
25import java.io.IOException;
26import java.net.HttpURLConnection;
27import java.net.URLConnection;
28import java.net.URL;
29import java.net.Authenticator;
30import java.net.PasswordAuthentication;
31import java.net.MalformedURLException;
32import java.lang.Object;
33
34
35/**
36 *
37 * @author <a href="mailto:[email protected]">Chi-Yu Huang</a>
38 * @version $Revision: 9873 $
39 *
40 */
41
42public class GoogleSearch
43 extends AbstractSearch {
44
45 //Parameters connect to Proxy Server
46 private boolean using_proxy = false;
47 private String proxy_host = null;
48 private int proxy_port;
49 private String proxy_user = null;
50 private char [] proxy_passwd = null;
51
52 // google key
53 private String client_key = null;
54
55 public GoogleSearch()
56 {
57 }
58
59 //Configure GoogleSearch Service
60 public boolean configure(Element info, Element extra_info)
61 {
62 if (!super.configure(info, extra_info)) {
63 return false;
64 }
65
66 Element server_elem = (Element)GSXML.getChildByTagName(info, "googleServer");
67 if (server_elem == null) {
68 System.err.println("GoogleSearch.configure error: no googleServer element found");
69 return false;
70 }
71
72 client_key = server_elem.getAttribute("key");
73 if (client_key.equals("")) {
74 System.err.println("GoogleSearch.configure error: no client_key for the googleServer element");
75 return false;
76 }
77
78 does_paging = true;
79 does_chunking = false;
80
81 // are we behind a proxy??
82 // all the details should have been set up by the Message Router
83 proxy_host = System.getProperty("http.proxyHost");
84 if (proxy_host != null && !proxy_host.equals("")) {
85 using_proxy = true;
86 try {
87 proxy_port = Integer.parseInt(System.getProperty("http.proxyPort").trim());
88 } catch (Exception e) {
89 System.err.println("GoogleSearch.configure error: couldn't get proxy port, defaulting to 80");
90 proxy_port = 80;
91 }
92 PasswordAuthentication pa = Authenticator.requestPasswordAuthentication(proxy_host, null, proxy_port, "http", "", null);
93 proxy_user = pa.getUserName();
94 proxy_passwd = pa.getPassword();
95 }
96 return true;
97 }
98
99 /** Process a text query - implemented by concrete subclasses */
100 protected Element processTextQuery(Element request) {
101
102 //Connect to Google Web API service
103 com.google.soap.search.GoogleSearch search = new com.google.soap.search.GoogleSearch();
104
105 // Set mandatory attributes
106 search.setKey(client_key);
107
108 // proxy??
109 if (using_proxy) {
110 search.setProxyHost(proxy_host);
111 search.setProxyPort(proxy_port);
112 search.setProxyUserName(proxy_user);
113 search.setProxyPassword(new String(proxy_passwd));
114 }
115
116 //set optional attributes
117 search.setSafeSearch(true);
118
119 // Create a new (empty) result message
120 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
121 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
122 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
123 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
124 result.appendChild(doc_node_list);
125 Element query_metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
126 result.appendChild(query_metadata_list);
127
128 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
129 if (param_list == null) {
130 System.err.println("GoogleSearch.processTextQuery Error:: TextQuery request had no paramList.");
131 return result; // Return the empty result
132 }
133
134 // Process the request parameters
135 HashMap params = GSXML.extractParams(param_list, false);
136
137 // Make sure a query has been specified
138 String query = (String) params.get(QUERY_PARAM);
139 if (query == null || query.equals("")) {
140 System.err.println("GoogleSearch.processTextQuery Error: TextQuery request had no query string.");
141 return result; // Return the empty result
142 }
143 // tidy whitespace
144 query = query.replaceAll("\\s+", "+");
145
146 search.setQueryString(query);
147
148 //Check hits_per_page
149 int hits_per_page;
150 try {
151 hits_per_page = Integer.parseInt(((String)params.get(HITS_PER_PAGE_PARAM)).trim());
152 } catch (Exception e) {
153 System.err.println("GoogleSearch.processTextQuery error: couldn't get hits per page param, defaulting to 10");
154 hits_per_page = 10;
155 }
156 //Check the start_page number
157 int start_page;
158 try {
159 start_page = Integer.parseInt(((String) params.get(START_PAGE_PARAM)).trim());
160 } catch (Exception e) {
161 System.err.println("GoogleSearch.processTextQuery error: couldn't get start page param, defaulting to 1");
162 start_page = 1;
163 }
164
165 //Invoke Actual Search
166
167 // Google only allows 10 hits per request
168 int loop = hits_per_page/10;
169 int remainder = hits_per_page%10;
170 int google_start_page = (start_page-1)*hits_per_page;
171 int pages_per_loop;
172 for (int j=0; j < loop; j++){
173 if (j < (loop-1) || remainder == 0) {
174 pages_per_loop = 10;
175 } else {
176 pages_per_loop = remainder;
177 }
178 search.setMaxResults(pages_per_loop);
179 search.setStartResult(google_start_page);
180 google_start_page = google_start_page + pages_per_loop;
181 GoogleSearchResult google_result;
182 try{
183 google_result = search.doSearch();
184 } catch (GoogleSearchFault ex) {
185 System.err.println("GoogleSearch.processTextQuery error: the call to the Google Web APIs failed:" + ex.toString());
186 // add the error to the result
187 return result;
188 }
189 if (j==0) {
190
191 //Total amount of documents Google Search returned
192 // only need to do this on the first loop
193
194 long numdocs_matched = google_result.getEstimatedTotalResultsCount();
195 GSXML.addMetadata(this.doc, query_metadata_list, "numDocsMatched", ""+numdocs_matched);
196
197 }
198 GoogleSearchResultElement[] details = google_result.getResultElements();
199 for (int i=0; i<details.length; i++){
200 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
201 doc_node_list.appendChild(doc_node);
202
203 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
204 doc_node.appendChild(metadata_list);
205
206 String google_url = details[i].getURL();
207 String google_title = details[i].getTitle();
208 String google_snippet = details[i].getSnippet();
209 if (google_url !=null) {
210 GSXML.addMetadata(this.doc, metadata_list, "URL", google_url);
211 }
212 if (google_title != null) {
213 GSXML.addMetadata(this.doc, metadata_list, "Title", google_title);
214 }
215 if (google_snippet != null) {
216 GSXML.addMetadata(this.doc, metadata_list, "Snippet", google_snippet);
217 }
218 }
219 } // for each loop
220 return result;
221 }
222
223 protected void getIndexData(ArrayList index_ids, ArrayList index_names,String lang){
224 index_ids.add("idx");
225 //index_names.add(getTextString("param."+FIELD_PARAM+".kw", lang));
226 index_names.add("Google main index");
227 }
228
229}
Note: See TracBrowser for help on using the repository browser.