[30416] | 1 |
|
---|
| 2 | import java.io.*;
|
---|
| 3 | import java.net.*;
|
---|
| 4 | import java.nio.*;
|
---|
| 5 | import java.nio.channels.*;
|
---|
| 6 | import java.nio.file.*;
|
---|
| 7 | import java.util.*;
|
---|
| 8 |
|
---|
| 9 | import org.jsoup.Jsoup;
|
---|
| 10 | import org.jsoup.nodes.Document;
|
---|
| 11 | import org.jsoup.nodes.Element;
|
---|
| 12 | import org.jsoup.select.Elements;
|
---|
| 13 |
|
---|
| 14 |
|
---|
| 15 | class AMCArtistScrape
|
---|
| 16 | {
|
---|
| 17 |
|
---|
[30418] | 18 | private static void printf(String msg, Object... args)
|
---|
| 19 | {
|
---|
| 20 | System.out.println(String.format(msg, args));
|
---|
| 21 | }
|
---|
| 22 |
|
---|
[30416] | 23 | public static void downloadURL(String url_str) {
|
---|
| 24 |
|
---|
| 25 | try {
|
---|
| 26 | URL url = new URL(url_str);
|
---|
| 27 |
|
---|
| 28 | String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() );
|
---|
[30418] | 29 | File local_file = new File("audio",local_file_str);
|
---|
[30416] | 30 |
|
---|
| 31 | if (!local_file.exists()) {
|
---|
| 32 | System.out.printf("Downloading audio to '%s'...\n",local_file_str);
|
---|
| 33 | FileOutputStream fos = new FileOutputStream(local_file);
|
---|
| 34 |
|
---|
| 35 | ReadableByteChannel rbc = Channels.newChannel(url.openStream());
|
---|
| 36 | fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
|
---|
| 37 |
|
---|
| 38 | System.out.println("... done");
|
---|
| 39 | }
|
---|
| 40 | else {
|
---|
| 41 | System.out.printf("Skipping '%s' as is already exists\n",local_file_str);
|
---|
| 42 | }
|
---|
| 43 |
|
---|
| 44 | }
|
---|
| 45 | catch (Exception e) {
|
---|
| 46 | e.printStackTrace();
|
---|
| 47 | }
|
---|
| 48 | }
|
---|
| 49 |
|
---|
[30418] | 50 | public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc)
|
---|
[30416] | 51 | {
|
---|
| 52 |
|
---|
| 53 | ArrayList<MusicRecord> records = new ArrayList<MusicRecord>();
|
---|
| 54 |
|
---|
[30418] | 55 | Elements table_rows = doc.select("table.search_results tr");
|
---|
| 56 |
|
---|
| 57 | for (Element tr : table_rows) {
|
---|
[30416] | 58 |
|
---|
[30418] | 59 | Elements td_cells = tr.select("td");
|
---|
[30416] | 60 |
|
---|
[30418] | 61 | if (td_cells.size() == 6) {
|
---|
| 62 | Element title_anchor = td_cells.get(0).select("a").first();
|
---|
| 63 | String track_url = title_anchor.attr("href");
|
---|
| 64 | String title = title_anchor.ownText();
|
---|
| 65 | String instrumentation = td_cells.get(1).ownText();
|
---|
| 66 | String composer = td_cells.get(2).ownText();
|
---|
| 67 | String year = td_cells.get(3).ownText();
|
---|
| 68 |
|
---|
| 69 | Element audio_elem = td_cells.get(4).select("audio").first();
|
---|
| 70 | if (audio_elem != null) {
|
---|
| 71 | String audio_url = audio_elem.attr("src");
|
---|
| 72 |
|
---|
| 73 | MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url);
|
---|
| 74 | System.out.println(mr.toJSONString());
|
---|
| 75 |
|
---|
| 76 | records.add(mr);
|
---|
| 77 | }
|
---|
| 78 |
|
---|
| 79 | }
|
---|
[30416] | 80 |
|
---|
[30418] | 81 | }
|
---|
| 82 |
|
---|
| 83 | return records;
|
---|
| 84 | }
|
---|
[30416] | 85 |
|
---|
[30418] | 86 | private static void followArtistPages(String base_domain,String base_url, String artist_suffix)
|
---|
| 87 | {
|
---|
| 88 | String artist_url_str = base_domain+base_url+artist_suffix;
|
---|
[30416] | 89 |
|
---|
[30418] | 90 | try {
|
---|
| 91 | boolean has_next_page = true;
|
---|
| 92 | int page_number = 1;
|
---|
[30416] | 93 |
|
---|
[30418] | 94 | while (has_next_page) {
|
---|
| 95 | System.out.println("Procesing Artist suffix '" + artist_suffix + "': Page " + page_number);
|
---|
[30416] | 96 |
|
---|
[30418] | 97 | Document doc = Jsoup.connect(artist_url_str).get();
|
---|
[30416] | 98 |
|
---|
[30418] | 99 | ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc);
|
---|
| 100 |
|
---|
| 101 | for (MusicRecord mr: music_records) {
|
---|
| 102 | downloadURL(mr.getAudioURL());
|
---|
| 103 | }
|
---|
| 104 |
|
---|
| 105 | // <ul class="pagination">
|
---|
| 106 | // ...
|
---|
| 107 | // <li href="URL-TO-FOLLOW">Next</li>
|
---|
| 108 | // </ul>
|
---|
| 109 |
|
---|
| 110 | Elements pagination_block = doc.select("ul.pagination > li > a");
|
---|
| 111 | Element last_li_a = pagination_block.last();
|
---|
| 112 | String last_li_a_text = last_li_a.text();
|
---|
| 113 | if (last_li_a_text.equals("Next >")) {
|
---|
| 114 | artist_url_str = base_domain+last_li_a.attr("href");
|
---|
| 115 | }
|
---|
| 116 | else {
|
---|
| 117 | has_next_page = false;
|
---|
| 118 | }
|
---|
| 119 |
|
---|
| 120 | page_number++;
|
---|
| 121 |
|
---|
| 122 | //System.out.println("*** last li a text = " + last_li_a_text);
|
---|
| 123 | //System.out.println("*** artist_url_str = " + artist_url_str);
|
---|
[30416] | 124 | }
|
---|
| 125 | }
|
---|
| 126 | catch (Exception e) {
|
---|
| 127 | e.printStackTrace();
|
---|
| 128 | }
|
---|
| 129 | }
|
---|
| 130 |
|
---|
| 131 |
|
---|
| 132 |
|
---|
| 133 | public static void main(String[] args)
|
---|
| 134 | {
|
---|
| 135 |
|
---|
[30418] | 136 | // Example artist page at AMC
|
---|
| 137 | // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean
|
---|
[30416] | 138 |
|
---|
[30418] | 139 | String base_domain = "http://www.australianmusiccentre.com.au";
|
---|
| 140 | String base_url = "/search?type=work&sort=alphaTitleSort";
|
---|
| 141 | base_url += "&wfc[]=";
|
---|
[30416] | 142 |
|
---|
[30418] | 143 | followArtistPages(base_domain,base_url,"Roger+Dean");
|
---|
[30416] | 144 | }
|
---|
| 145 | }
|
---|