source: main/trunk/model-sites-dev/hathitrust/collect/capisco-european-pacific-encounters/java/CapiscoTopicToSynonyms.java@ 31289

Last change on this file since 31289 was 31289, checked in by davidb, 7 years ago

initial setup files for collection

File size: 6.7 KB
Line 
1
2import java.util.zip.*;
3import java.io.BufferedOutputStream;
4import java.io.BufferedReader;
5import java.io.BufferedWriter;
6import java.io.File;
7import java.io.FileInputStream;
8import java.io.FileNotFoundException;
9import java.io.FileOutputStream;
10import java.io.FileReader;
11import java.io.IOException;
12import java.io.InputStreamReader;
13import java.io.ObjectOutputStream;
14import java.io.OutputStreamWriter;
15import java.net.Socket;
16import java.nio.charset.Charset;
17import java.nio.charset.StandardCharsets;
18import java.nio.file.Files;
19import java.nio.file.Paths;
20import java.util.ArrayList;
21import java.util.HashMap;
22import java.util.LinkedList;
23import java.util.List;
24
25
26
/**
 * Command-line tool that builds a topic-to-synonyms table by querying a
 * line-based lookup server over a socket.
 *
 * <p>Given a server name, a port and a documents path, it:
 * <ol>
 *   <li>extracts every {@code *.zip} archive found under the documents path,
 *       next to the archive itself;</li>
 *   <li>reads every {@code *.txt.topics} file (UTF-8, topics separated by
 *       {@code |}) and, for each topic not seen before, sends an
 *       {@code artid <topic>} request followed by a
 *       {@code synonyms <article id>} request;</li>
 *   <li>serializes the resulting {@code HashMap<String, List<String>>} to
 *       {@code topics-to-synonyms.ser} in the working directory.</li>
 * </ol>
 *
 * <p>Server replies are read line by line until a terminator line
 * ({@code --message end--} or {@code 0--message end--}) is seen.
 */
public class CapiscoTopicToSynonyms {

    /** Size in bytes of the copy buffer used when extracting zip entries. */
    private static final int BUFFER_SIZE = 4096;

    /** Line that terminates a server reply. */
    private static final String MESSAGE_END = "--message end--";

    /** Alternative terminator line accepted by the reply readers. */
    private static final String EMPTY_MESSAGE_END = "0--message end--";

    /** Topic (exactly as parsed from the topics file) mapped to the synonyms the server returned. */
    private static final HashMap<String, List<String>> topicsToSynonyms =
            new HashMap<String, List<String>>();

    /**
     * Program entry point.
     *
     * @param args {@code [server name, server port, input documents path]}
     */
    public static void main(String[] args) {
        if (args.length != 3 || args[0].endsWith("help")) {
            printUsage();
            System.exit(-1);
        }

        String serverName = args[0];
        String documentspath = args[2];
        int port;
        try {
            port = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("error: server port must be an integer, got '" + args[1] + "'");
            printUsage();
            System.exit(-1);
            return; // not reached; lets the compiler see that port is always assigned
        }

        System.out.println("Connecting to " + serverName + " on port " + port);
        // try-with-resources closes the socket (and with it both streams) on every exit path.
        try (Socket client = new Socket(serverName, port)) {
            System.out.println("Just connected to " + client.getRemoteSocketAddress());
            // Topic files are decoded as UTF-8, so the same charset is used on the wire
            // instead of the platform default.
            // NOTE(review): assumes the server also speaks UTF-8 -- confirm.
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(client.getInputStream(), StandardCharsets.UTF_8));
            BufferedWriter out = new BufferedWriter(
                    new OutputStreamWriter(client.getOutputStream(), StandardCharsets.UTF_8));
            File documents = new File(documentspath);

            unzipAll(documents);
            processFiles(documents, in, out);

            serializeHashmap("topics-to-synonyms.ser");
        } catch (IOException e) {
            System.out.println("error: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Prints the command-line usage message to standard error. */
    private static void printUsage() {
        System.err.println();
        System.err.println("Usage: CapiscoTopicToSynonyms 'server URL' 'server port' 'input documents path'");
        System.err.println();
    }

    /**
     * Recursively walks {@code file} and extracts every {@code *.zip} archive
     * into the directory that contains the archive.
     *
     * @param file a file or directory to scan
     * @throws IOException if an archive cannot be read, an entry cannot be
     *         written, or an entry would be extracted outside the archive's
     *         directory
     */
    private static void unzipAll(File file) throws IOException {
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                System.err.println("Unable to list directory " + file);
                return;
            }
            for (File child : children) {
                unzipAll(child);
            }
            return;
        }

        if (!file.getName().toLowerCase().endsWith(".zip")) {
            return;
        }

        // getAbsoluteFile() guarantees a non-null parent even for a bare relative name.
        File targetDir = file.getAbsoluteFile().getParentFile();
        try (ZipInputStream zipIn = new ZipInputStream(new FileInputStream(file))) {
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                File target = resolveEntry(targetDir, entry);
                if (entry.isDirectory()) {
                    ensureDirectory(target);
                } else {
                    // Archives are not required to contain entries for parent directories.
                    File parent = target.getParentFile();
                    if (parent != null) {
                        ensureDirectory(parent);
                    }
                    extractFile(zipIn, target.getPath());
                }
                zipIn.closeEntry();
            }
        }
    }

    /**
     * Maps a zip entry to its destination below {@code targetDir}, rejecting
     * entries (for example names containing {@code ..}) that would resolve
     * outside of it.
     */
    private static File resolveEntry(File targetDir, ZipEntry entry) throws IOException {
        String rootPath = targetDir.getCanonicalPath();
        String rootPrefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;
        File target = new File(targetDir, entry.getName());
        String targetPath = target.getCanonicalPath();
        if (!targetPath.equals(rootPath) && !targetPath.startsWith(rootPrefix)) {
            throw new IOException("Zip entry '" + entry.getName() + "' would be extracted outside " + rootPath);
        }
        return target;
    }

    /** Creates {@code dir} together with any missing parents, failing loudly if that is not possible. */
    private static void ensureDirectory(File dir) throws IOException {
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Unable to create directory " + dir);
        }
    }

    /**
     * Splits the content of a topics file into its individual topics.
     *
     * @param content text with topics separated by {@code |}
     * @return the topics in order; entries may be empty strings
     */
    static String[] parseTopics(String content) {
        return content.split("\\|");
    }

    /** Returns whether {@code line} is one of the two reply terminator lines. */
    private static boolean isMessageEnd(String line) {
        return line.equals(MESSAGE_END) || line.equals(EMPTY_MESSAGE_END);
    }

    /**
     * Reads one server reply and returns it as an article id.
     *
     * <p>Every line up to (not including) the terminator is appended to the
     * result; a leading {@code "> "} on a line is dropped first.
     *
     * @param input reader positioned at the start of a reply
     * @return the concatenated reply lines, possibly empty
     * @throws IOException if reading fails or the stream ends before a
     *         terminator line is seen
     */
    static String getArticleId(BufferedReader input) throws IOException {
        StringBuilder articleId = new StringBuilder();
        while (true) {
            String line = input.readLine();
            if (line == null) {
                throw new IOException("Connection closed before the end of the article id reply");
            }
            if (isMessageEnd(line)) {
                return articleId.toString();
            }
            if (line.startsWith("> ")) {
                line = line.substring(2);
            }
            articleId.append(line);
        }
    }

    /**
     * Reads one server reply and returns the synonyms it lists.
     *
     * <p>Each reply line is a {@code |}-separated list whose first field is
     * the item count; that field is skipped and the remaining fields are
     * collected.
     *
     * @param input reader positioned at the start of a reply
     * @return the synonyms in the order received, possibly empty
     * @throws IOException if reading fails or the stream ends before a
     *         terminator line is seen
     */
    static List<String> getSynonyms(BufferedReader input) throws IOException {
        List<String> synonyms = new ArrayList<String>();
        while (true) {
            String line = input.readLine();
            if (line == null) {
                throw new IOException("Connection closed before the end of the synonyms reply");
            }
            if (isMessageEnd(line)) {
                return synonyms;
            }
            String[] words = line.split("\\|");
            // deliberately skip the first value, which is the number of items in the x|y|z list
            for (int i = 1; i < words.length; i++) {
                synonyms.add(words[i]);
            }
        }
    }

    /**
     * Reads a whole file into a string.
     *
     * @param path location of the file
     * @param encoding charset used to decode the file's bytes
     * @return the decoded file content
     * @throws IOException if the file cannot be read
     */
    static String readFile(String path, Charset encoding) throws IOException {
        byte[] encoded = Files.readAllBytes(Paths.get(path));
        return new String(encoded, encoding);
    }

    /**
     * Copies the current zip entry to {@code filePath}.
     *
     * @param zipIn stream positioned on the entry to extract
     * @param filePath destination file; it is overwritten if it exists
     * @throws IOException if reading the entry or writing the file fails
     */
    private static void extractFile(ZipInputStream zipIn, String filePath) throws IOException {
        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = zipIn.read(buffer)) != -1) {
                bos.write(buffer, 0, read);
            }
        }
    }

    /**
     * Turns a topic into the text sent to the server. Requests are single
     * lines, so line breaks inside a topic (typically the newline that ends
     * the topics file) are replaced and surrounding whitespace is dropped;
     * otherwise one topic would be written as several request lines.
     */
    private static String toProtocolLine(String topic) {
        return topic.replaceAll("[\\r\\n]+", " ").trim();
    }

    /**
     * Recursively processes {@code file}: for every {@code *.txt.topics} file
     * found, looks up the article id and synonyms of each topic that is not
     * yet in the table and stores the result.
     *
     * @param file a topics file or a directory to scan
     * @param input reader for server replies
     * @param output writer for server requests; flushed after every request
     * @throws IOException if a topics file cannot be read or the exchange
     *         with the server fails
     */
    static void processFiles(File file, BufferedReader input, BufferedWriter output) throws IOException {
        if (!file.exists()) {
            System.out.println(file + " does not exist.");
            return;
        }

        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                System.err.println("Unable to list directory " + file);
                return;
            }
            for (File child : children) {
                processFiles(child, input, output);
            }
            return;
        }

        String filename = file.getName().toLowerCase();
        if (!filename.endsWith(".txt.topics")) {
            System.out.println("Skipped " + filename);
            return;
        }

        String content = readFile(file.getAbsolutePath(), StandardCharsets.UTF_8);
        for (String topic : parseTopics(content)) {
            if (topic.equals("")) {
                continue;
            }
            if (topicsToSynonyms.containsKey(topic)) {
                System.out.println(" ... skipping " + topic + " as it is already in the hashmap");
                continue;
            }

            // The table key stays the topic exactly as parsed; only the text
            // put on the wire is cleaned of line breaks.
            String query = toProtocolLine(topic);
            if (query.isEmpty()) {
                continue;
            }

            // Step 1, convert each topic to article id
            output.write("artid " + query + "\n");
            output.flush();
            System.out.println("Topic '" + query + "' sent for article id lookup.");
            String articleId = getArticleId(input);
            System.out.println(" '" + query + "' received article id: " + articleId);

            // Step 2, look up synonyms for each article id
            output.write("synonyms " + articleId + "\n");
            output.flush();
            System.out.println(" Article id " + articleId + " send for synonym analysis.");
            List<String> synonyms = getSynonyms(input);
            System.out.println(" " + articleId + " synonyms received (" + synonyms.size() + ")");

            // Step 3, store in hashmap
            topicsToSynonyms.put(topic, synonyms);
        }
        System.out.println(file + " processed.");
        System.out.println("=====");
    }

    /**
     * Writes the topic-to-synonyms table to {@code filename} using Java
     * object serialization. I/O failures are reported on standard error and
     * not rethrown.
     *
     * @param filename path of the file to create or overwrite
     */
    public static void serializeHashmap(String filename) {
        try (FileOutputStream fos = new FileOutputStream(filename);
             ObjectOutputStream oos = new ObjectOutputStream(fos)) {
            oos.writeObject(topicsToSynonyms);
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return;
        }
        // println rather than printf: the filename must not be interpreted as a format string.
        System.out.println("Serialized HashMap data saved to:" + filename);
    }
}
Note: See TracBrowser for help on using the repository browser.