Changeset 33044


Ignore:
Timestamp:
05/02/19 14:51:03 (21 months ago)
Author:
cpb16
Message:

Streamlined numpages checking and random selection. Corrected COMPX-RUN-X.sh to download all files (naming error). NEXT: Clean up corpus generation and move on to the next phase

Location:
other-projects/is-sheet-music-encore/trunk
Files:
1 added
8 edited

Legend:

Unmodified
Added
Removed
  • other-projects/is-sheet-music-encore/trunk/COMPX520-RUN-META.sh

    r33017 r33044  
    77
    88doc_id=$1
     9doc_id_file=`echo $doc_id | sed 's/:/+/' | sed 's/\//=/g'`
    910
    10 output_file="java-gen-corpus/download-meta/$doc_id-META.txt"
     11output_file="java-gen-corpus/download-meta/$doc_id_file-META.txt"
    1112echo "Retrieving doc-id-page: $doc_id -> $output_file"
    1213echo ""
  • other-projects/is-sheet-music-encore/trunk/COMPX520-RUN-PNG.sh

    r33010 r33044  
    99page_num=$2
    1010
    11 output_file="download-images/$doc_id-$page_num.png"
     11doc_id_file=`echo $doc_id | sed 's/:/+/' | sed 's/\//=/g'`
     12
     13
     14output_file="download-images/$doc_id_file-$page_num.png"
    1215echo "Retrieving doc-id-page: $doc_id-$page_num -> $output_file"
    1316echo ""
  • other-projects/is-sheet-music-encore/trunk/Makefile

    r33031 r33044  
    33
    44filter-full-50-MU:
    5     #cd java-gen-corpus; java TabProcTextGen hathiFull.txt hathiFullIDList-MU.txt MU
    6     #./COMPX520-DOWNLOADER-META.sh java-gen-corpus/hathiFullIDList-MU.txt
     5    #source SETUP.bash MUST DO THIS MANUALLY
     6    cd java-gen-corpus; java TabProcTextGen hathiFull.txt hathiFullIDList-MU.txt MU
     7    ./COMPX520-DOWNLOADER-META.sh java-gen-corpus/hathiFullIDList-MU.txt
    78    cd java-gen-corpus; ./metadata-formater.sh
    89    cd java-gen-corpus; ./ValidIDListGen.sh MU
    910    cd java-gen-corpus; java TabRndListGen hathiValidIDList-MU.txt 50 hathiValidRnd50List-MU.txt
    1011
     12test:
     13    cd java-gen-corpus; javac *.java
     14    cd java-gen-corpus; java TabProcTextGen hathiFull.txt _test_hathiFullIDList-MU.txt MU
     15    cd java-gen-corpus; java TabRndListGen _test_hathiFullIDList-MU.txt 50 _test_hathiValidIDList-MU-50.txt
    1116
    1217
     18
  • other-projects/is-sheet-music-encore/trunk/dapiclient2-extended-META.pl

    r33014 r33044  
    153153    print $response->content;
    154154}
    155 exit $success;
     155
     156if ($success) {
     157    exit 0;
     158}
     159else {
     160    exit 1;
     161}
    156162
    157163
    158164
     165
  • other-projects/is-sheet-music-encore/trunk/java-gen-corpus/TabProcTextGen.java

    r33031 r33044  
    4747                    item[19].equals(inputType) &&
    4848                    (item[24].equals("open")||item[24].equals("page"))){
     49                        //Encode id
     50                        String idEncoded = item[0];
     51                        //idEncoded = idEncoded.replaceAll(":", "+").replaceAll("/", "=");
    4952                        //Write item to file
    50                         fw.write(item[0] + "\n");
     53                        fw.write(idEncoded + "\n");
    5154                    }
    5255                }       
     
    6366//https://www.javatpoint.com/java-filewriter-class
    6467//https://docs.oracle.com/javase/8/docs/api/index.html?java/io/FileWriter.html
     68//Email from supervisor (Compx520 ID Error Apr 24)
  • other-projects/is-sheet-music-encore/trunk/java-gen-corpus/TabRndListGen.java

    r33031 r33044  
    66import java.io.FileReader;
    77import java.io.FileWriter;
     8import java.io.*;
    89import java.util.*;
    910
     
    2526
    2627        ArrayList<String> list = new ArrayList<String>();
    27         ArrayList<String> subList;
     28   
    2829        String line = null;
    2930        String[] item;
     
    3839        //randomize list
    3940        Collections.shuffle(list);
    40 
    41         //Take <outputCount>
    42         subList = new ArrayList<String>(list.subList(0,outputCount));
     41       
     42        int countMatchingCrit = 0;
     43        int j = 0;
     44        ArrayList<String> subList = new ArrayList<String>();
     45        //Download meta record, check if numpages >=10
     46        while(countMatchingCrit < outputCount){
     47            System.out.println("Processing item: " + j);
     48            String idCurr = list.get(j);
     49            //run download script
     50            String cmd = "./download_metadata_temp.sh " + idCurr;
     51            Process p = Runtime.getRuntime().exec(cmd);
     52            /*BufferedReader br = new BufferedReader(new InputStreamReader(p.getInputStream()));
     53                while(br.ready())
     54                {
     55                        System.out.println(br.readLine());
     56                }*/
     57            p.waitFor();
     58            int exitStatus = p.exitValue();
     59            if(exitStatus == 0){
     60                //Extract numpages
     61                int numpages = getNumPages(idCurr);
     62                if(numpages >= 10){
     63                    System.out.println("Successful items: " + countMatchingCrit);
     64                    countMatchingCrit++;
     65                    subList.add(idCurr);                       
     66                }
     67            }
     68            else{
     69                System.err.println("failed to run:" + cmd);
     70                System.exit(exitStatus);
     71            }
     72            j++;
     73            if(j >= list.size()){break;}
     74        }
     75       
    4376
    4477        //Write these to new file
     
    5487    }
    5588    }
     89    private static int getNumPages(String id){
     90        String numpages = null;
     91        try{
     92                //Variables
     93                String inputFilename = "metadata_temp.xml";
     94                FileReader fileReader = new FileReader(inputFilename);
     95                BufferedReader buf = new BufferedReader(fileReader);
     96                String line = null;
     97                String[] item;
     98
     99                //Splits into each record
     100                while ((line = buf.readLine()) != null) {                   
     101                    /*
     102                    if(line.contains("<id>")){
     103                        //Isolate and store the id from the line
     104                        idLine = line.substring(line.indexOf(">")+1, line.indexOf("</"));
     105                        id  = idLine.substring(idLine.lastIndexOf("meta/")+5);
     106                       
     107                    }
     108                    */ 
     109                    if(line.contains("<htd:numpages>")){
     110                        //Isolate and store the page number
     111                        numpages = line.substring(line.indexOf(">")+1, line.lastIndexOf("<"));
     112                                       
     113                    }                                           
     114                }
     115                buf.close();
     116                //Check if there are more than 10 pages
     117   
     118        }catch(Exception e){
     119            e.printStackTrace();
     120        }
     121        return Integer.parseInt(numpages);
     122    }
    56123}
    57124// Returns 42474
    58125
    59126//REFERENCES
     127//https://stackoverflow.com/questions/12892665/how-to-capture-the-exit-status-of-a-shell-command-in-java
    60128//https://www.javatpoint.com/java-filewriter-class
    61129//https://docs.oracle.com/javase/8/docs/api/index.html?java/io/FileWriter.html
    62130//https://www.geeksforgeeks.org/randomly-select-items-from-a-list-in-java/
    63131//https://codereview.stackexchange.com/questions/146551/picking-10-distinct-words-randomly-from-list-of-unique-words
     132//http://www.linuxforums.org/forum/programming-scripting/65117-c-c-system-function-analog-java.html
    64133
    65134//USE RUN-LIST.txt (modify file it reads using $1 (terminal entry variable (filename))
Note: See TracChangeset for help on using the changeset viewer.