source: other-projects/is-sheet-music-encore/trunk/java-gen-corpus/TabRndListGen.java@ 33044

Last change on this file since 33044 was 33044, checked in by cpb16, 5 years ago

Streamlined numpages checking and random selection. Corrected COMPX-RUN-X.sh to download all files (naming error). NEXT: Clean up corpus generation and move on to the next phase

File size: 3.9 KB
Line 
1//NAME:Caleb Bird
2//ID: 1289680
3//??References??
4
5import java.io.BufferedReader;
6import java.io.FileReader;
7import java.io.FileWriter;
8import java.io.*;
9import java.util.*;
10
/**
 * Picks a random sample of document IDs whose metadata reports at least
 * {@code MIN_PAGES} pages.
 *
 * <p>The first tab-separated field of every input line is treated as an ID. The IDs
 * are shuffled, then visited in order: for each one the download script is run and
 * the page count is read from the metadata file. IDs that qualify are collected
 * until the requested count is reached or the candidates run out, and are then
 * written one per line to the output file.
 */
public class TabRndListGen {
    /** Script run once per candidate ID; the ID is passed as its only argument. */
    private static final String DOWNLOAD_SCRIPT = "./download_metadata_temp.sh";
    /** File the page count is read from; presumably (re)written by the download script. */
    private static final String METADATA_FILE = "metadata_temp.xml";
    /** Opening tag that precedes the page count in the metadata file. */
    private static final String NUMPAGES_TAG = "<htd:numpages>";
    /** Minimum page count an item needs to be selected. */
    private static final int MIN_PAGES = 10;

    /**
     * Command-line entry point.
     *
     * @param args {@code <inputFilename> <outputCount> <outputFilename>}
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            System.out.println("Usage: TabRndListGen <inputFilename> <outputCount> <outputFilename>");
            return;
        }
        String inputFilename = args[0];
        String outputFilename = args[2];
        int outputCount;
        try {
            outputCount = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("outputCount must be an integer, got: " + args[1]);
            return;
        }

        try {
            List<String> candidates = readIds(inputFilename);
            Collections.shuffle(candidates);

            // Open the output before the (slow) download loop so an unwritable
            // path is reported before any work is done.
            try (FileWriter fw = new FileWriter(outputFilename)) {
                for (String id : selectIds(candidates, outputCount)) {
                    fw.write(id + '\n');
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers up the stack can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Reads the ID (text before the first tab) from every line of the input file.
     * Lines whose ID is empty are skipped so that no empty argument reaches the script.
     *
     * @param inputFilename path of the tab-separated input file
     * @return the IDs in file order; empty if the file has no usable lines
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> readIds(String inputFilename) throws IOException {
        List<String> ids = new ArrayList<>();
        try (BufferedReader buf = new BufferedReader(new FileReader(inputFilename))) {
            String line;
            while ((line = buf.readLine()) != null) {
                int tab = line.indexOf('\t');
                String id = tab < 0 ? line : line.substring(0, tab);
                if (!id.isEmpty()) {
                    ids.add(id);
                }
            }
        }
        return ids;
    }

    /**
     * Walks the candidates in order and keeps those with at least {@code MIN_PAGES}
     * pages, stopping once {@code outputCount} items are kept or the candidates are
     * exhausted. If the download script exits with a non-zero status the JVM is
     * terminated with that status.
     *
     * @param candidates  IDs to try, in the order they should be tried
     * @param outputCount maximum number of IDs to select
     * @return the selected IDs, at most {@code outputCount} of them
     * @throws IOException          if the download script cannot be started
     * @throws InterruptedException if interrupted while waiting for the script
     */
    private static List<String> selectIds(List<String> candidates, int outputCount)
            throws IOException, InterruptedException {
        List<String> selected = new ArrayList<>();
        // The bounds check comes first so an empty candidate list is never indexed.
        for (int j = 0; j < candidates.size() && selected.size() < outputCount; j++) {
            System.out.println("Processing item: " + j);
            String idCurr = candidates.get(j);

            int exitStatus = downloadMetadata(idCurr);
            if (exitStatus != 0) {
                System.err.println("failed to run:" + DOWNLOAD_SCRIPT + " " + idCurr);
                System.exit(exitStatus);
            }

            if (getNumPages(idCurr) >= MIN_PAGES) {
                selected.add(idCurr);
                System.out.println("Successful items: " + selected.size());
            }
        }
        return selected;
    }

    /**
     * Runs the download script for one ID and waits for it to finish.
     *
     * @param id the ID handed to the script as a single argument
     * @return the script's exit status
     * @throws IOException          if the script cannot be started
     * @throws InterruptedException if interrupted while waiting for the script
     */
    private static int downloadMetadata(String id) throws IOException, InterruptedException {
        // inheritIO() sends the script's output to this process's console; leaving
        // the pipes unread could block the child once the pipe buffer fills up.
        Process p = new ProcessBuilder(DOWNLOAD_SCRIPT, id).inheritIO().start();
        return p.waitFor();
    }

    /**
     * Extracts the page count from the metadata file. If the tag occurs more than
     * once, the last occurrence wins.
     *
     * @param id the ID being processed; only used in diagnostic messages
     * @return the page count, or {@code -1} if the file cannot be read, the tag is
     *         missing, or its content is not an integer
     */
    private static int getNumPages(String id) {
        String numpages = null;
        try (BufferedReader buf = new BufferedReader(new FileReader(METADATA_FILE))) {
            String line;
            while ((line = buf.readLine()) != null) {
                int open = line.indexOf(NUMPAGES_TAG);
                if (open < 0) {
                    continue;
                }
                // Take the text between the opening tag and the next '<' (or the
                // end of the line when the closing tag is not on this line).
                int start = open + NUMPAGES_TAG.length();
                int end = line.indexOf('<', start);
                String text = end < 0 ? line.substring(start) : line.substring(start, end);
                numpages = text.trim();
            }
        } catch (IOException e) {
            System.err.println("could not read " + METADATA_FILE + " for " + id + ": " + e.getMessage());
            return -1;
        }

        if (numpages == null) {
            return -1;
        }
        try {
            return Integer.parseInt(numpages);
        } catch (NumberFormatException e) {
            System.err.println("invalid numpages '" + numpages + "' for " + id);
            return -1;
        }
    }
}
124// Returns 42474
125
126//REFERENCES
127//https://stackoverflow.com/questions/12892665/how-to-capture-the-exit-status-of-a-shell-command-in-java
128//https://www.javatpoint.com/java-filewriter-class
129//https://docs.oracle.com/javase/8/docs/api/index.html?java/io/FileWriter.html
130//https://www.geeksforgeeks.org/randomly-select-items-from-a-list-in-java/
131//https://codereview.stackexchange.com/questions/146551/picking-10-distinct-words-randomly-from-list-of-unique-words
132//http://www.linuxforums.org/forum/programming-scripting/65117-c-c-system-function-analog-java.html
133
134//USE RUN-LIST.txt (modify the file it reads using $1, the filename passed on the command line)
135
Note: See TracBrowser for help on using the repository browser.