source: main/trunk/model-sites-dev/hathitrust/collect/capisco-european-pacific-encounters/java/CapiscoTextToTopics.java@ 31289

Last change on this file since 31289 was 31289, checked in by davidb, 7 years ago

initial setup files for collection

File size: 5.1 KB
Line 
1
2import java.util.zip.*;
3import java.io.BufferedOutputStream;
4import java.io.BufferedReader;
5import java.io.BufferedWriter;
6import java.io.File;
7import java.io.FileInputStream;
8import java.io.FileNotFoundException;
9import java.io.FileOutputStream;
10import java.io.FileReader;
11import java.io.IOException;
12import java.io.InputStreamReader;
13import java.io.OutputStreamWriter;
14import java.net.Socket;
15import java.nio.charset.Charset;
16import java.nio.charset.StandardCharsets;
17import java.nio.file.Files;
18import java.nio.file.Paths;
19import java.util.LinkedList;
20import java.util.List;
21
22
/**
 * Command-line client that sends text documents to a topic-analysis server
 * over a plain socket and stores the topics it gets back.
 *
 * <p>For the given documents path the tool first unpacks every {@code .zip}
 * archive it finds (next to the archive itself), then walks the tree again and,
 * for every {@code .htm}, {@code .html} or {@code .txt} file, sends the file
 * content to the server and writes the returned topics to
 * {@code <file>.topics} as a {@code |}-terminated list.
 */
public class CapiscoTextToTopics {

    /** Size in bytes of the copy buffer used when extracting zip entries. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Entry point.
     *
     * @param args {@code 'server URL' 'server port' 'input documents path'}
     */
    public static void main(String[] args) {

        // Exactly three arguments are required; "help" as first argument also prints usage.
        if (args.length != 3 || args[0].endsWith("help")) {
            printUsage();
            System.exit(-1);
        }

        String serverName = args[0];
        String documentspath = args[2];

        int port = 0;
        try {
            port = Integer.parseInt(args[1]);
        }
        catch (NumberFormatException e) {
            // Report a bad port as a usage problem instead of dying with a raw stack trace.
            System.err.println("error: invalid server port '" + args[1] + "'");
            printUsage();
            System.exit(-1);
        }

        System.out.println("Connecting to " + serverName
                           + " on port " + port);

        // try-with-resources guarantees the socket is closed even when processing fails.
        try (Socket client = new Socket(serverName, port)) {
            System.out.println("Just connected to "
                               + client.getRemoteSocketAddress());
            // NOTE(review): both streams use the platform default charset while documents are
            // read as UTF-8; confirm which encoding the server expects.
            BufferedReader in = new BufferedReader(new InputStreamReader(client.getInputStream()));
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(client.getOutputStream()));
            File documents = new File(documentspath);

            unzipAll(documents);
            processFiles(documents, in, out);
        }
        catch (IOException e) {
            System.out.println("error: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Prints the command-line usage message to standard error. */
    private static void printUsage() {
        System.err.println();
        System.err.println("Usage: CapiscoTextToTopic 'server URL' 'server port' 'input documents path'");
        System.err.println();
    }

    /**
     * Recursively extracts every {@code .zip} file found at or below {@code file}
     * into the directory that contains the archive.
     *
     * @param file a file or directory to scan
     * @throws IOException if an archive cannot be read, an entry cannot be written,
     *                     or an entry name would resolve outside the archive's directory
     */
    private static void unzipAll(File file) throws IOException {

        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be listed.
                return;
            }
            for (File child : children) {
                unzipAll(child);
            }
            return;
        }

        if (!file.getName().toLowerCase().endsWith(".zip")) {
            return;
        }

        // Resolve via the absolute path: getParent() is null for a bare relative file name.
        File targetDir = file.getAbsoluteFile().getParentFile();
        String basePath = targetDir.getCanonicalPath();
        String basePrefix = basePath.endsWith(File.separator) ? basePath : basePath + File.separator;

        try (ZipInputStream zipIn = new ZipInputStream(new FileInputStream(file))) {
            ZipEntry entry;
            // iterates over entries in the zip file
            while ((entry = zipIn.getNextEntry()) != null) {
                File destination = new File(targetDir, entry.getName());
                String destinationPath = destination.getCanonicalPath();

                // Refuse entries such as "../../x" that would be written outside targetDir.
                if (!destinationPath.equals(basePath) && !destinationPath.startsWith(basePrefix)) {
                    throw new IOException("Zip entry '" + entry.getName() + "' in " + file
                                          + " resolves outside " + basePath);
                }

                if (entry.isDirectory()) {
                    // mkdirs() also creates missing intermediate directories.
                    if (!destination.isDirectory() && !destination.mkdirs()) {
                        throw new IOException("Could not create directory " + destination);
                    }
                }
                else {
                    // Archives are not required to contain explicit directory entries.
                    File parent = destination.getParentFile();
                    if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                        throw new IOException("Could not create directory " + parent);
                    }
                    extractFile(zipIn, destination.getPath());
                }
                zipIn.closeEntry();
            }
        }
    }

    /**
     * Reads one server response and collects the second {@code |}-separated field
     * of every line that has at least two fields. Reading stops after a line equal
     * to {@code --message end--} or {@code 0--message end--}.
     *
     * @param input reader positioned at the start of a response
     * @return the collected topics, in the order received
     * @throws IOException if reading fails or the stream ends before the end marker
     */
    static List<String> getTopics(BufferedReader input) throws IOException
    {
        List<String> topics = new LinkedList<String>();
        while (true) {
            String line = input.readLine();
            if (line == null) {
                // readLine() returns null at end of stream; the response was cut short.
                throw new IOException("Stream ended before '--message end--' was received");
            }
            String[] words = line.split("\\|");
            if (words.length > 1) {
                topics.add(words[1]);
            }
            if (line.equals("--message end--") || line.equals("0--message end--")) {
                return topics;
            }
        }
    }

    /**
     * Reads a whole file into a string.
     *
     * @param path     path of the file to read
     * @param encoding charset used to decode the file's bytes
     * @return the decoded file content
     * @throws IOException if the file cannot be read
     */
    static String readFile(String path, Charset encoding)
        throws IOException
    {
        byte[] encoded = Files.readAllBytes(Paths.get(path));
        return new String(encoded, encoding);
    }

    /**
     * Copies the current entry of {@code zipIn} to {@code filePath}.
     *
     * @param zipIn    zip stream positioned at the entry to extract
     * @param filePath destination file path
     * @throws IOException if reading the entry or writing the file fails
     */
    private static void extractFile(ZipInputStream zipIn, String filePath) throws IOException {
        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = zipIn.read(buffer)) != -1) {
                bos.write(buffer, 0, read);
            }
        }
    }

    /**
     * Recursively processes {@code file}: every {@code .htm}, {@code .html} or
     * {@code .txt} file is sent to the server through {@code output}, the reply is
     * read from {@code input}, and the topics are written to {@code <file>.topics}.
     * Other files are reported as skipped.
     *
     * @param file   file or directory to process
     * @param input  reader for server responses
     * @param output writer for server requests
     * @throws IOException if a document cannot be read, the exchange with the server
     *                     fails, or the topics file cannot be written
     */
    static void processFiles(File file, BufferedReader input, BufferedWriter output) throws IOException {

        if (!file.exists()) {
            System.out.println(file + " does not exist.");
            // Nothing to process; do not fall through and create a stray .topics file.
            return;
        }

        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be listed.
                return;
            }
            for (File child : children) {
                processFiles(child, input, output);
            }
            return;
        }

        String filename = file.getName().toLowerCase();
        boolean isDocument = filename.endsWith(".htm")
                             || filename.endsWith(".html")
                             || filename.endsWith(".txt");
        if (!isDocument) {
            System.out.println("Skipped " + filename);
            return;
        }

        // Request framing: a "topics xxxxxx" header line, the content, then an "xxxxxx" line.
        String content = readFile(file.getAbsolutePath(), StandardCharsets.UTF_8);
        output.write("topics xxxxxx\n ");
        output.write(content + "\n");
        output.write("xxxxxx \n");
        output.flush();
        System.out.println(file.getAbsolutePath() + " send for analysis.");

        List<String> topics = getTopics(input);
        System.out.println(file.getAbsolutePath() + " has concepts received.");

        // Create the topics file only once the reply is in hand, so a failed exchange
        // does not leave an empty file behind.
        File topicsFile = new File(file.getAbsolutePath() + ".topics");
        try (FileOutputStream fop = new FileOutputStream(topicsFile)) {
            for (String topic : topics) {
                // NOTE(review): platform default charset, matching how the reply was decoded
                // by the caller-supplied reader in main(); confirm the intended encoding.
                fop.write((topic + "|").getBytes());
            }
        }
        System.out.println(file.getAbsolutePath() + " processed.");
    }

}
Note: See TracBrowser for help on using the repository browser.