import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.net.Socket;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.zip.*;

/**
 * Builds a topic-to-synonyms lookup table by querying a line-oriented server.
 *
 * <p>The tool unpacks every {@code .zip} archive found under the input path, reads each
 * {@code *.txt.topics} file (a {@code |}-separated list of topics) and, for every topic not
 * seen before, sends an {@code artid <topic>} request followed by a
 * {@code synonyms <article id>} request over a TCP socket. The resulting map is written
 * with Java serialization to {@code topics-to-synonyms.ser} in the working directory.
 */
public class CapiscoTopicToSynonyms {

    /** Size in bytes of the copy buffer used while extracting zip entries. */
    private static final int BUFFER_SIZE = 4096;

    /** Line that terminates every server reply. */
    private static final String MESSAGE_END = "--message end--";

    /**
     * Alternative terminator line accepted by the reply readers.
     * NOTE(review): presumably produced when the server emits a bare "0" payload without a
     * line break before the end marker - confirm against the server implementation.
     */
    private static final String ZERO_MESSAGE_END = "0--message end--";

    /** Prefix stripped from article-id reply lines. */
    private static final String PROMPT_PREFIX = "> ";

    /** File the collected map is serialized to. */
    private static final String OUTPUT_FILE = "topics-to-synonyms.ser";

    /**
     * Topic -> synonyms collected so far. Declared as {@link HashMap} (not {@code Map})
     * because the concrete type is what gets written by {@link #serializeHashmap(String)}.
     */
    private static final HashMap<String, List<String>> topicsToSynonyms =
            new HashMap<String, List<String>>();

    /**
     * Command-line entry point.
     *
     * @param args {@code server URL}, {@code server port}, {@code input documents path}
     */
    public static void main(String[] args) {
        if (args.length != 3 || args[0].endsWith("help")) {
            printUsageAndExit();
        }

        String serverName = args[0];
        String documentspath = args[2];
        int port;
        try {
            port = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("Invalid server port: " + args[1]);
            printUsageAndExit();
            return; // not reached; satisfies definite assignment of 'port'
        }

        System.out.println("Connecting to " + serverName + " on port " + port);
        // try-with-resources closes the socket (and with it both streams) on every exit path.
        try (Socket client = new Socket(serverName, port)) {
            System.out.println("Just connected to " + client.getRemoteSocketAddress());

            // Topic files are decoded as UTF-8, so talk to the server in UTF-8 as well instead
            // of whatever the platform default charset happens to be.
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(client.getInputStream(), StandardCharsets.UTF_8));
            BufferedWriter out = new BufferedWriter(
                    new OutputStreamWriter(client.getOutputStream(), StandardCharsets.UTF_8));

            File documents = new File(documentspath);
            unzipAll(documents);
            processFiles(documents, in, out);

            serializeHashmap(OUTPUT_FILE);
        } catch (IOException e) {
            System.out.println("error: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Prints the usage banner to standard error and terminates the JVM with status -1. */
    private static void printUsageAndExit() {
        System.err.println();
        System.err.println("Usage: CapiscoTopicToSynonyms 'server URL' 'server port' 'input documents path'");
        System.err.println();
        System.exit(-1);
    }

    /**
     * Recursively extracts every {@code .zip} file under {@code file} into the directory that
     * contains the archive.
     *
     * @param file a directory to walk or a single file to inspect
     * @throws IOException if an archive cannot be read, an entry cannot be written, or an
     *         entry name would resolve outside the archive's directory
     */
    private static void unzipAll(File file) throws IOException {
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                return;
            }
            for (File child : children) {
                unzipAll(child);
            }
            return;
        }

        if (!file.getName().toLowerCase(Locale.ROOT).endsWith(".zip")) {
            return;
        }

        // getParent() is null for a bare relative name such as "docs.zip"; resolve via the
        // absolute form so entries always land next to the archive.
        File targetDir = file.getAbsoluteFile().getParentFile();
        String targetRoot = targetDir.getCanonicalPath();
        String targetPrefix = targetRoot.endsWith(File.separator)
                ? targetRoot
                : targetRoot + File.separator;

        try (ZipInputStream zipIn = new ZipInputStream(new FileInputStream(file))) {
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                File destination = new File(targetDir, entry.getName());

                // Reject entries such as "../../etc/passwd" that would escape targetDir.
                String destinationPath = destination.getCanonicalPath();
                if (!destinationPath.equals(targetRoot) && !destinationPath.startsWith(targetPrefix)) {
                    throw new IOException("Zip entry '" + entry.getName() + "' in " + file
                            + " resolves outside " + targetRoot);
                }

                if (entry.isDirectory()) {
                    if (!destination.isDirectory() && !destination.mkdirs()) {
                        throw new IOException("Could not create directory " + destination);
                    }
                } else {
                    extractFile(zipIn, destination.getPath());
                }
                zipIn.closeEntry();
            }
        }
    }

    /**
     * Splits the content of a topics file into its individual topics.
     *
     * @param content raw file content, topics separated by {@code |}
     * @return the pieces between separators, unmodified (may include empty strings)
     */
    static String[] parseTopics(String content) {
        return content.split("\\|");
    }

    /**
     * Reads a reply to an {@code artid} request.
     *
     * <p>All lines up to the terminator are concatenated; a leading {@code "> "} is removed
     * from each line before it is appended.
     *
     * @param input reader positioned at the start of the reply
     * @return the concatenated reply payload, possibly empty
     * @throws IOException if reading fails or the stream ends before the terminator line
     */
    static String getArticleId(BufferedReader input) throws IOException {
        StringBuilder articleId = new StringBuilder();
        for (String line = readReplyLine(input); !isMessageEnd(line); line = readReplyLine(input)) {
            // The terminator test runs on the raw line, before the prefix is stripped.
            if (line.startsWith(PROMPT_PREFIX)) {
                line = line.substring(PROMPT_PREFIX.length());
            }
            articleId.append(line);
        }
        return articleId.toString();
    }

    /**
     * Reads a reply to a {@code synonyms} request.
     *
     * <p>Each payload line has the form {@code count|syn1|syn2|...}; the leading count is
     * dropped and the remaining fields are collected in order.
     *
     * @param input reader positioned at the start of the reply
     * @return the synonyms in the order received, possibly empty
     * @throws IOException if reading fails or the stream ends before the terminator line
     */
    static List<String> getSynonyms(BufferedReader input) throws IOException {
        List<String> synonyms = new ArrayList<String>();
        for (String line = readReplyLine(input); !isMessageEnd(line); line = readReplyLine(input)) {
            String[] fields = line.split("\\|");
            // Deliberately skip fields[0], which is the number of items in the x|y|z list.
            for (int i = 1; i < fields.length; i++) {
                synonyms.add(fields[i]);
            }
        }
        return synonyms;
    }

    /** Reads one reply line, turning end-of-stream into an IOException instead of null. */
    private static String readReplyLine(BufferedReader input) throws IOException {
        String line = input.readLine();
        if (line == null) {
            throw new IOException("Connection closed before '" + MESSAGE_END + "' was received");
        }
        return line;
    }

    /** Returns whether {@code line} is one of the two accepted reply terminators. */
    private static boolean isMessageEnd(String line) {
        return MESSAGE_END.equals(line) || ZERO_MESSAGE_END.equals(line);
    }

    /**
     * Reads a whole file into a string.
     *
     * @param path location of the file
     * @param encoding charset used to decode the bytes
     * @return the decoded file content
     * @throws IOException if the file cannot be read
     */
    static String readFile(String path, Charset encoding) throws IOException {
        byte[] encoded = Files.readAllBytes(Paths.get(path));
        return new String(encoded, encoding);
    }

    /**
     * Copies the current zip entry to {@code filePath}, creating missing parent directories.
     *
     * @param zipIn stream positioned at the entry to extract
     * @param filePath destination file
     * @throws IOException if the destination cannot be created or written
     */
    private static void extractFile(ZipInputStream zipIn, String filePath) throws IOException {
        // Archives are not required to contain explicit directory entries for nested files.
        File parent = new File(filePath).getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("Could not create directory " + parent);
        }

        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = zipIn.read(buffer)) != -1) {
                bos.write(buffer, 0, read);
            }
        }
    }

    /**
     * Recursively processes {@code file}: every {@code *.txt.topics} file is parsed and each
     * topic that is not yet in the map is resolved through the server and stored.
     *
     * @param file directory to walk or single file to process
     * @param input reader for server replies
     * @param output writer for server requests
     * @throws IOException if a topics file cannot be read or the server exchange fails
     */
    static void processFiles(File file, BufferedReader input, BufferedWriter output) throws IOException {
        if (!file.exists()) {
            System.out.println(file + " does not exist.");
            return;
        }

        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                return;
            }
            for (File child : children) {
                processFiles(child, input, output);
            }
            return;
        }

        String filename = file.getName().toLowerCase(Locale.ROOT);
        if (!filename.endsWith(".txt.topics")) {
            System.out.println("Skipped " + filename);
            return;
        }

        String content = readFile(file.getAbsolutePath(), StandardCharsets.UTF_8);
        for (String rawTopic : parseTopics(content)) {
            // Trim so a trailing line break in the file cannot leak into the one-line request.
            String topic = rawTopic.trim();
            if (topic.isEmpty()) {
                continue;
            }

            if (topicsToSynonyms.containsKey(topic)) {
                System.out.println(" ... skipping " + topic + " as it is already in the hashmap");
                continue;
            }

            topicsToSynonyms.put(topic, lookUpSynonyms(topic, input, output));
        }

        System.out.println(file + " processed.");
        System.out.println("=====");
    }

    /** Resolves {@code topic} to an article id and returns the synonyms the server reports for it. */
    private static List<String> lookUpSynonyms(String topic, BufferedReader input, BufferedWriter output)
            throws IOException {
        // Step 1, convert the topic to an article id
        output.write("artid " + topic + "\n");
        output.flush();
        System.out.println("Topic '" + topic + "' sent for article id lookup.");
        String articleId = getArticleId(input);
        System.out.println(" '" + topic + "' received article id: " + articleId);

        // Step 2, look up synonyms for the article id
        output.write("synonyms " + articleId + "\n");
        output.flush();
        System.out.println(" Article id " + articleId + " send for synonym analysis.");
        List<String> synonyms = getSynonyms(input);
        System.out.println(" " + articleId + " synonyms received (" + synonyms.size() + ")");

        return synonyms;
    }

    /**
     * Writes the collected topic-to-synonyms map to {@code filename} using Java serialization.
     * An I/O failure is reported with a stack trace and otherwise ignored.
     *
     * @param filename destination file
     */
    public static void serializeHashmap(String filename) {
        try (FileOutputStream fos = new FileOutputStream(filename);
             ObjectOutputStream oos = new ObjectOutputStream(fos)) {
            oos.writeObject(topicsToSynonyms);
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return;
        }
        // Pass the name as an argument: a '%' in the file name must not be read as a format spec.
        System.out.printf("Serialized HashMap data saved to:%s%n", filename);
    }
}