
import java.io.*;
import java.net.*;
import java.nio.*;
import java.nio.channels.*;
import java.nio.file.*;
import java.util.*;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


15 | class AMCArtistScrape
|
---|
16 | {
|
---|
17 |
|
---|
18 | private static void printf(String msg, Object... args)
|
---|
19 | {
|
---|
20 | System.out.println(String.format(msg, args));
|
---|
21 | }
|
---|
22 |
|
---|
23 | public static void downloadURL(String url_str) {
|
---|
24 |
|
---|
25 | try {
|
---|
26 | URL url = new URL(url_str);
|
---|
27 |
|
---|
28 | String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() );
|
---|
29 | File local_file = new File("audio",local_file_str);
|
---|
30 |
|
---|
31 | if (!local_file.exists()) {
|
---|
32 | System.out.printf("Downloading audio to '%s'...\n",local_file_str);
|
---|
33 | FileOutputStream fos = new FileOutputStream(local_file);
|
---|
34 |
|
---|
35 | ReadableByteChannel rbc = Channels.newChannel(url.openStream());
|
---|
36 | fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
|
---|
37 |
|
---|
38 | System.out.println("... done");
|
---|
39 | }
|
---|
40 | else {
|
---|
41 | System.out.printf("Skipping '%s' as is already exists\n",local_file_str);
|
---|
42 | }
|
---|
43 |
|
---|
44 | }
|
---|
45 | catch (Exception e) {
|
---|
46 | e.printStackTrace();
|
---|
47 | }
|
---|
48 | }
|
---|
49 |
|
---|
50 | public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc)
|
---|
51 | {
|
---|
52 |
|
---|
53 | ArrayList<MusicRecord> records = new ArrayList<MusicRecord>();
|
---|
54 |
|
---|
55 | Elements table_rows = doc.select("table.search_results tr");
|
---|
56 |
|
---|
57 | for (Element tr : table_rows) {
|
---|
58 |
|
---|
59 | Elements td_cells = tr.select("td");
|
---|
60 |
|
---|
61 | if (td_cells.size() == 6) {
|
---|
62 | Element title_anchor = td_cells.get(0).select("a").first();
|
---|
63 | String track_url = title_anchor.attr("href");
|
---|
64 | String title = title_anchor.ownText();
|
---|
65 | String instrumentation = td_cells.get(1).ownText();
|
---|
66 | String composer = td_cells.get(2).ownText();
|
---|
67 | String year = td_cells.get(3).ownText();
|
---|
68 |
|
---|
69 | Element audio_elem = td_cells.get(4).select("audio").first();
|
---|
70 | if (audio_elem != null) {
|
---|
71 | String audio_url = audio_elem.attr("src");
|
---|
72 |
|
---|
73 | MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url);
|
---|
74 | System.out.println(mr.toJSONString());
|
---|
75 |
|
---|
76 | records.add(mr);
|
---|
77 | }
|
---|
78 |
|
---|
79 | }
|
---|
80 |
|
---|
81 | }
|
---|
82 |
|
---|
83 | return records;
|
---|
84 | }
|
---|
85 |
|
---|
86 | private static void followArtistPages(String base_domain,String base_url, String artist_suffix)
|
---|
87 | {
|
---|
88 | String artist_url_str = base_domain+base_url+artist_suffix;
|
---|
89 |
|
---|
90 | try {
|
---|
91 | boolean has_next_page = true;
|
---|
92 | int page_number = 1;
|
---|
93 |
|
---|
94 | while (has_next_page) {
|
---|
95 | System.out.println("Procesing Artist suffix '" + artist_suffix + "': Page " + page_number);
|
---|
96 |
|
---|
97 | Document doc = Jsoup.connect(artist_url_str).get();
|
---|
98 |
|
---|
99 | ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc);
|
---|
100 |
|
---|
101 | for (MusicRecord mr: music_records) {
|
---|
102 | downloadURL(mr.getAudioURL());
|
---|
103 | }
|
---|
104 |
|
---|
105 | // <ul class="pagination">
|
---|
106 | // ...
|
---|
107 | // <li href="URL-TO-FOLLOW">Next</li>
|
---|
108 | // </ul>
|
---|
109 |
|
---|
110 | Elements pagination_block = doc.select("ul.pagination > li > a");
|
---|
111 | Element last_li_a = pagination_block.last();
|
---|
112 | String last_li_a_text = last_li_a.text();
|
---|
113 | if (last_li_a_text.equals("Next >")) {
|
---|
114 | artist_url_str = base_domain+last_li_a.attr("href");
|
---|
115 | }
|
---|
116 | else {
|
---|
117 | has_next_page = false;
|
---|
118 | }
|
---|
119 |
|
---|
120 | page_number++;
|
---|
121 |
|
---|
122 | //System.out.println("*** last li a text = " + last_li_a_text);
|
---|
123 | //System.out.println("*** artist_url_str = " + artist_url_str);
|
---|
124 | }
|
---|
125 | }
|
---|
126 | catch (Exception e) {
|
---|
127 | e.printStackTrace();
|
---|
128 | }
|
---|
129 | }
|
---|
130 |
|
---|
131 |
|
---|
132 |
|
---|
133 | public static void main(String[] args)
|
---|
134 | {
|
---|
135 |
|
---|
136 | // Example artist page at AMC
|
---|
137 | // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean
|
---|
138 |
|
---|
139 | String base_domain = "http://www.australianmusiccentre.com.au";
|
---|
140 | String base_url = "/search?type=work&sort=alphaTitleSort";
|
---|
141 | base_url += "&wfc[]=";
|
---|
142 |
|
---|
143 | followArtistPages(base_domain,base_url,"Roger+Dean");
|
---|
144 | }
|
---|
145 | }
|
---|