[33007] | 1 | //NAME:Caleb Bird
|
---|
| 2 | //ID: 1289680
|
---|
| 3 | //??References??
|
---|
| 4 |
|
---|
| 5 | import java.io.BufferedReader;
|
---|
| 6 | import java.io.FileReader;
|
---|
| 7 | import java.io.FileWriter;
|
---|
[33044] | 8 | import java.io.*;
|
---|
[33007] | 9 | import java.util.*;
|
---|
| 10 |
|
---|
/**
 * Picks a random sample of IDs from a tab-separated list, keeping only those
 * whose downloaded metadata record reports at least {@link #MIN_PAGES} pages.
 *
 * <p>Usage: {@code TabRndListGen <inputFilename> <outputCount> <outputFilename>}
 *
 * <p>The first tab-separated column of every input line is taken as an ID. The
 * IDs are shuffled and then checked one by one: for each ID the script
 * {@code ./download_metadata_temp.sh <id>} is run, and the page count is read
 * from {@code metadata_temp.xml}. Qualifying IDs are collected until
 * {@code outputCount} have been found or the list is exhausted, and are then
 * written one per line to the output file.
 */
public class TabRndListGen {

    /** Printed when the command line is malformed. */
    private static final String USAGE =
            "Usage: TabRndListGen <inputFilename> <outputCount> <outputFilename>";

    /** Script executed once per candidate ID, resolved against the working directory. */
    private static final String DOWNLOAD_SCRIPT = "./download_metadata_temp.sh";

    /** File from which the page count of the current candidate is read. */
    private static final String METADATA_FILE = "metadata_temp.xml";

    private static final String NUMPAGES_OPEN = "<htd:numpages>";
    private static final String NUMPAGES_CLOSE = "</htd:numpages>";

    /** Minimum number of pages an item needs in order to be selected. */
    private static final int MIN_PAGES = 10;

    /** Returned by {@link #getNumPages(String)} when no page count could be determined. */
    private static final int UNKNOWN_PAGES = -1;

    /**
     * Program entry point.
     *
     * @param args {@code inputFilename}, {@code outputCount}, {@code outputFilename}
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            System.out.println(USAGE);
            return;
        }

        String inputFilename = args[0];
        String outputFilename = args[2];
        int outputCount;
        try {
            outputCount = Integer.parseInt(args[1].trim());
        } catch (NumberFormatException e) {
            System.err.println("outputCount must be an integer, got: " + args[1]);
            System.out.println(USAGE);
            return;
        }

        try {
            List<String> ids = readIds(inputFilename);
            Collections.shuffle(ids);

            // The writer is opened before the (slow) selection so that an unwritable
            // output path is reported immediately; try-with-resources guarantees it
            // is flushed and closed on every exceptional path.
            try (FileWriter writer = new FileWriter(outputFilename)) {
                List<String> selected = selectIds(ids, outputCount);
                for (String id : selected) {
                    writer.write(id + "\n");
                }
                if (selected.size() < outputCount) {
                    System.err.println("Warning: only " + selected.size() + " of "
                            + outputCount + " requested items met the criteria");
                }
            }
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can see the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the first tab-separated column of every line of the input file.
     * Lines whose first column is blank are skipped, because a blank ID cannot
     * be passed to the download script as a meaningful argument.
     *
     * @param inputFilename path of the tab-separated input file
     * @return mutable list of IDs in file order
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> readIds(String inputFilename) throws IOException {
        List<String> ids = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(inputFilename))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String id = line.split("\t", -1)[0];
                if (!id.trim().isEmpty()) {
                    ids.add(id);
                }
            }
        }
        return ids;
    }

    /**
     * Walks the candidates in order and collects those with enough pages.
     * Stops as soon as {@code outputCount} items were collected or the
     * candidates run out. The bounds check happens before each access, so an
     * empty candidate list simply yields an empty result.
     *
     * <p>If the download script exits with a non-zero status the whole program
     * is terminated with that status.
     *
     * @param ids         candidate IDs, already in the desired (random) order
     * @param outputCount number of qualifying IDs wanted
     * @return the qualifying IDs, at most {@code outputCount} of them
     * @throws IOException          if the download script cannot be started
     * @throws InterruptedException if interrupted while waiting for the script
     */
    private static List<String> selectIds(List<String> ids, int outputCount)
            throws IOException, InterruptedException {
        List<String> selected = new ArrayList<String>();
        for (int index = 0; index < ids.size() && selected.size() < outputCount; index++) {
            System.out.println("Processing item: " + index);
            String id = ids.get(index);

            int exitStatus = downloadMetadata(id);
            if (exitStatus != 0) {
                System.err.println("failed to run:" + DOWNLOAD_SCRIPT + " " + id);
                System.exit(exitStatus);
            }

            if (getNumPages(id) >= MIN_PAGES) {
                selected.add(id);
                // Reported after adding, so the number is the count of successes so far.
                System.out.println("Successful items: " + selected.size());
            }
        }
        return selected;
    }

    /**
     * Runs the download script for one ID and waits for it to finish.
     *
     * <p>The ID is passed as a single argument (no whitespace re-tokenizing), and
     * the script's combined stdout/stderr is read and discarded so the child can
     * never block on a full pipe while this process waits for it.
     *
     * @param id the ID handed to the script
     * @return the script's exit status
     * @throws IOException          if the script cannot be started or its output read
     * @throws InterruptedException if interrupted while waiting for the script
     */
    private static int downloadMetadata(String id) throws IOException, InterruptedException {
        ProcessBuilder builder = new ProcessBuilder(DOWNLOAD_SCRIPT, id);
        builder.redirectErrorStream(true);
        Process process = builder.start();
        try (InputStream childOutput = process.getInputStream()) {
            byte[] scratch = new byte[4096];
            while (childOutput.read(scratch) != -1) {
                // Output is intentionally discarded.
            }
        }
        return process.waitFor();
    }

    /**
     * Extracts the page count from {@code metadata_temp.xml}.
     *
     * <p>The value between {@code <htd:numpages>} and {@code </htd:numpages>} is
     * used; if the tag occurs on several lines the last occurrence wins.
     * Surrounding whitespace and other markup on the same line are tolerated.
     *
     * @param id the ID the metadata belongs to; only used in diagnostics, since
     *           the metadata is always read from the same fixed file
     * @return the page count, or {@code -1} when the file is unreadable, the tag
     *         is absent, or its content is not an integer (such an item then
     *         fails the minimum-pages check instead of aborting the run)
     */
    private static int getNumPages(String id) {
        String value = null;
        try (BufferedReader reader = new BufferedReader(new FileReader(METADATA_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int open = line.indexOf(NUMPAGES_OPEN);
                if (open < 0) {
                    continue;
                }
                int start = open + NUMPAGES_OPEN.length();
                int end = line.indexOf(NUMPAGES_CLOSE, start);
                String raw = (end < 0) ? line.substring(start) : line.substring(start, end);
                value = raw.trim();
            }
        } catch (IOException e) {
            System.err.println("Could not read " + METADATA_FILE + " for " + id + ": " + e);
            return UNKNOWN_PAGES;
        }

        if (value == null) {
            return UNKNOWN_PAGES;
        }
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            System.err.println("Unparseable page count '" + value + "' for " + id);
            return UNKNOWN_PAGES;
        }
    }
}
|
---|
| 124 | // Returns 42474
|
---|
| 125 |
|
---|
| 126 | //REFERENCES
|
---|
[33044] | 127 | //https://stackoverflow.com/questions/12892665/how-to-capture-the-exit-status-of-a-shell-command-in-java
|
---|
[33007] | 128 | //https://www.javatpoint.com/java-filewriter-class
|
---|
| 129 | //https://docs.oracle.com/javase/8/docs/api/index.html?java/io/FileWriter.html
|
---|
| 130 | //https://www.geeksforgeeks.org/randomly-select-items-from-a-list-in-java/
|
---|
| 131 | //https://codereview.stackexchange.com/questions/146551/picking-10-distinct-words-randomly-from-list-of-unique-words
|
---|
[33044] | 132 | //http://www.linuxforums.org/forum/programming-scripting/65117-c-c-system-function-analog-java.html
|
---|
[33007] | 133 |
|
---|
| 134 | //USE RUN-LIST.txt (modify the file it reads using $1, the filename passed on the command line)
|
---|
| 135 |
|
---|