source: other-projects/is-sheet-music-encore/trunk/java-gen-corpus/javaGenValidIDList.java@ 33047

Last change on this file since 33047 was 33047, checked in by cpb16, 5 years ago

Corpus generator complete!

File size: 3.7 KB
Line 
1//NAME:Caleb Bird
2//ID: 1289680
3//??References??
4
5import java.io.BufferedReader;
6import java.io.FileReader;
7import java.io.FileWriter;
8import java.io.*;
9import java.util.*;
10
11
/**
 * Generates a random list of IDs whose metadata reports at least
 * {@link #MIN_PAGES} pages.
 *
 * <p>Steps performed by {@link #main(String[])}:
 * <ol>
 *   <li>read the first tab-separated column (the ID) of every line of the input file;</li>
 *   <li>shuffle the IDs;</li>
 *   <li>for each ID in turn, run {@code ./download_metadata_temp.sh <id>} and read the
 *       page count back from {@code metadata_temp.xml};</li>
 *   <li>stop once {@code outputCount} qualifying IDs were found (or the input is
 *       exhausted) and write them, one per line, to the output file.</li>
 * </ol>
 */
public class javaGenValidIDList {

    /** Minimum page count (inclusive) an ID needs in order to be selected. */
    private static final int MIN_PAGES = 10;

    /** Download script, resolved relative to the current working directory. */
    private static final String DOWNLOAD_SCRIPT = "./download_metadata_temp.sh";

    /** Metadata file that is read back after every run of the download script. */
    private static final String METADATA_FILENAME = "metadata_temp.xml";

    /** Opening tag that precedes the page count inside the metadata file. */
    private static final String NUMPAGES_TAG = "<htd:numpages>";

    /** Value returned by {@link #getNumPages(String)} when no page count could be read. */
    private static final int UNKNOWN_PAGES = -1;

    private static final String USAGE =
            "Usage: java javaGenValidIDList <inputFilename> <outputCount> <outputFilename>";

    /**
     * Program entry point.
     *
     * @param args {@code <inputFilename> <outputCount> <outputFilename>}
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            System.out.println(USAGE);
            return;
        }

        String inputFilename = args[0];
        String outputFilename = args[2];
        int outputCount;
        try {
            outputCount = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("outputCount must be an integer, got: " + args[1]);
            System.out.println(USAGE);
            return;
        }

        try {
            List<String> ids = readIds(inputFilename);
            Collections.shuffle(ids);

            // The writer is opened before the (slow) download loop so that an
            // unwritable output path is reported immediately.
            try (FileWriter writer = new FileWriter(outputFilename)) {
                for (String id : selectValidIds(ids, outputCount)) {
                    writer.write(id + '\n');
                }
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can observe it.
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting for " + DOWNLOAD_SCRIPT);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the ID (first tab-separated column) of every line of the input file.
     * Lines whose ID column is blank are skipped.
     *
     * @param inputFilename path of the tab-separated input file
     * @return the IDs in file order, as a mutable list
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> readIds(String inputFilename) throws IOException {
        List<String> ids = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(inputFilename))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int tab = line.indexOf('\t');
                String id = (tab < 0) ? line : line.substring(0, tab);
                if (!id.trim().isEmpty()) {
                    ids.add(id);
                }
            }
        }
        return ids;
    }

    /**
     * Walks the IDs in order and keeps those with at least {@link #MIN_PAGES} pages
     * until {@code outputCount} IDs were kept or the list is exhausted.
     *
     * <p>If the download script exits with a non-zero status the JVM is terminated
     * with that same status and nothing is written to the output file.
     *
     * @param ids         candidate IDs, already in the desired (random) order
     * @param outputCount number of qualifying IDs wanted
     * @return the qualifying IDs, at most {@code outputCount} of them
     * @throws IOException          if the download script cannot be started
     * @throws InterruptedException if the thread is interrupted while the script runs
     */
    private static List<String> selectValidIds(List<String> ids, int outputCount)
            throws IOException, InterruptedException {
        List<String> selected = new ArrayList<String>();

        // Checking the index in the loop condition also covers an empty ID list.
        for (int i = 0; i < ids.size() && selected.size() < outputCount; i++) {
            System.out.println("Processing item: " + i);
            String id = ids.get(i);

            int exitStatus = downloadMetadata(id);
            if (exitStatus != 0) {
                System.err.println("failed to run:" + DOWNLOAD_SCRIPT + " " + id);
                System.exit(exitStatus);
            }

            if (getNumPages(id) >= MIN_PAGES) {
                selected.add(id);
                System.out.println("Successful items: " + selected.size());
            }
        }

        if (selected.size() < outputCount) {
            System.err.println("Only " + selected.size() + " of the requested "
                    + outputCount + " IDs met the page-count criterion");
        }
        return selected;
    }

    /**
     * Runs the download script for one ID and waits for it to finish.
     *
     * <p>The ID is passed as a single argument (no whitespace tokenizing) and the
     * child inherits this process's stdout/stderr, so it cannot block on a full
     * pipe buffer that nobody reads.
     *
     * @param id ID handed to the script as its only argument
     * @return the script's exit status
     * @throws IOException          if the script cannot be started
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    private static int downloadMetadata(String id) throws IOException, InterruptedException {
        Process process = new ProcessBuilder(DOWNLOAD_SCRIPT, id).inheritIO().start();
        return process.waitFor();
    }

    /**
     * Extracts the page count from {@code metadata_temp.xml}, the file the download
     * script is expected to have just written for {@code id}.
     *
     * <p>Every line containing {@code <htd:numpages>} is inspected; the text between
     * that tag and the next {@code '<'} is parsed as an integer. If several lines
     * match, the last parsable value wins.
     *
     * @param id ID the metadata belongs to; used for diagnostics only, the file name is fixed
     * @return the page count, or {@code -1} if the file is missing, unreadable, or
     *         holds no parsable {@code <htd:numpages>} value
     */
    private static int getNumPages(String id) {
        int numPages = UNKNOWN_PAGES;
        try (BufferedReader reader = new BufferedReader(new FileReader(METADATA_FILENAME))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int tagStart = line.indexOf(NUMPAGES_TAG);
                if (tagStart < 0) {
                    continue;
                }
                int valueStart = tagStart + NUMPAGES_TAG.length();
                int valueEnd = line.indexOf('<', valueStart);
                String value = (valueEnd < 0)
                        ? line.substring(valueStart)
                        : line.substring(valueStart, valueEnd);
                try {
                    numPages = Integer.parseInt(value.trim());
                } catch (NumberFormatException e) {
                    System.err.println("Ignoring non-numeric page count '" + value
                            + "' for ID " + id);
                }
            }
        } catch (IOException e) {
            System.err.println("Could not read " + METADATA_FILENAME + " for ID " + id
                    + ": " + e);
        }
        return numPages;
    }
}
114// Returns 42474
115
116//REFERENCES
117//https://stackoverflow.com/questions/12892665/how-to-capture-the-exit-status-of-a-shell-command-in-java
118//https://www.javatpoint.com/java-filewriter-class
119//https://docs.oracle.com/javase/8/docs/api/index.html?java/io/FileWriter.html
120//https://www.geeksforgeeks.org/randomly-select-items-from-a-list-in-java/
121//https://codereview.stackexchange.com/questions/146551/picking-10-distinct-words-randomly-from-list-of-unique-words
122//http://www.linuxforums.org/forum/programming-scripting/65117-c-c-system-function-analog-java.html
123
124//USE RUN-LIST.txt (modify the file it reads using $1, i.e. the filename given on the terminal)
125
Note: See TracBrowser for help on using the repository browser.