1 |
|
---|
2 | import java.io.*;
|
---|
3 | import java.net.*;
|
---|
4 | import java.nio.*;
|
---|
5 | import java.nio.charset.*;
|
---|
6 | import java.nio.channels.*;
|
---|
7 | import java.nio.file.*;
|
---|
8 | import java.util.*;
|
---|
9 |
|
---|
10 | import org.jsoup.Jsoup;
|
---|
11 | import org.jsoup.nodes.Document;
|
---|
12 | import org.jsoup.nodes.Element;
|
---|
13 | import org.jsoup.select.Elements;
|
---|
14 |
|
---|
15 |
|
---|
16 | class AMCArtistScrape
|
---|
17 | {
|
---|
18 |
|
---|
19 | private static void printf(String msg, Object... args)
|
---|
20 | {
|
---|
21 | System.out.println(String.format(msg, args));
|
---|
22 | }
|
---|
23 |
|
---|
24 | public static File urlToLocalFile(String url_str,String ext)
|
---|
25 | {
|
---|
26 |
|
---|
27 | String local_file_str = url_str.substring(url_str.lastIndexOf(File.separator)+1, url_str.length() );
|
---|
28 |
|
---|
29 | int ext_dot_pos = local_file_str.lastIndexOf('.');
|
---|
30 | String local_file_root_str = local_file_str;
|
---|
31 | if (ext_dot_pos>0) {
|
---|
32 | local_file_root_str = local_file_str.substring(0,ext_dot_pos);
|
---|
33 | }
|
---|
34 |
|
---|
35 | if (ext != null) {
|
---|
36 | local_file_str = local_file_root_str + ext;
|
---|
37 | }
|
---|
38 |
|
---|
39 | // Add in 'repair' style subdirectory
|
---|
40 | String repair_subdir = local_file_root_str.substring(local_file_root_str.length()-2);
|
---|
41 |
|
---|
42 | File local_dir = new File("audio",repair_subdir);
|
---|
43 | File local_file = null;
|
---|
44 | try {
|
---|
45 | if (!local_dir.exists()) {
|
---|
46 | local_dir.mkdir();
|
---|
47 | }
|
---|
48 |
|
---|
49 | local_file = new File(local_dir,local_file_str);
|
---|
50 | }
|
---|
51 | catch (Exception e) {
|
---|
52 | e.printStackTrace();
|
---|
53 | }
|
---|
54 |
|
---|
55 | return local_file;
|
---|
56 | }
|
---|
57 |
|
---|
58 | public static File urlToLocalFile(String url_str)
|
---|
59 | {
|
---|
60 | return urlToLocalFile(url_str,null);
|
---|
61 | }
|
---|
62 |
|
---|
63 |
|
---|
64 | public static void downloadURL(String url_str)
|
---|
65 | {
|
---|
66 |
|
---|
67 | try {
|
---|
68 | URL url = new URL(url_str);
|
---|
69 |
|
---|
70 | File local_file = urlToLocalFile(url_str);
|
---|
71 |
|
---|
72 | if (!local_file.exists()) {
|
---|
73 | System.out.printf("Downloading audio to '%s'...\n",local_file.getPath());
|
---|
74 | FileOutputStream fos = new FileOutputStream(local_file);
|
---|
75 |
|
---|
76 | ReadableByteChannel rbc = Channels.newChannel(url.openStream());
|
---|
77 | fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
|
---|
78 |
|
---|
79 | System.out.println("... done");
|
---|
80 | }
|
---|
81 | else {
|
---|
82 | System.out.printf("Skipping '%s' as is already exists\n",local_file.getPath());
|
---|
83 | }
|
---|
84 |
|
---|
85 | }
|
---|
86 | catch (Exception e) {
|
---|
87 | e.printStackTrace();
|
---|
88 | }
|
---|
89 | }
|
---|
90 |
|
---|
91 | public static void saveJSONMetadata(MusicRecord mr, String url_str)
|
---|
92 | {
|
---|
93 | File json_file = urlToLocalFile(url_str,".json");
|
---|
94 | String json_text = mr.toJSONString();
|
---|
95 |
|
---|
96 | try {
|
---|
97 | Files.write(Paths.get(json_file.getAbsolutePath()),json_text.getBytes(StandardCharsets.UTF_8));
|
---|
98 | }
|
---|
99 | catch (IOException e) {
|
---|
100 | e.printStackTrace();
|
---|
101 | }
|
---|
102 |
|
---|
103 | }
|
---|
104 |
|
---|
105 | public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc)
|
---|
106 | {
|
---|
107 |
|
---|
108 | ArrayList<MusicRecord> records = new ArrayList<MusicRecord>();
|
---|
109 |
|
---|
110 | Elements table_rows = doc.select("table.search_results tr");
|
---|
111 |
|
---|
112 | for (Element tr : table_rows) {
|
---|
113 |
|
---|
114 | Elements td_cells = tr.select("td");
|
---|
115 |
|
---|
116 | if (td_cells.size() == 6) {
|
---|
117 | Element title_anchor = td_cells.get(0).select("a").first();
|
---|
118 | String track_url = title_anchor.attr("href");
|
---|
119 | String title = title_anchor.ownText();
|
---|
120 | String instrumentation = td_cells.get(1).ownText();
|
---|
121 | String composer = td_cells.get(2).ownText();
|
---|
122 | String year = td_cells.get(3).ownText();
|
---|
123 |
|
---|
124 | Element audio_elem = td_cells.get(4).select("audio").first();
|
---|
125 | if (audio_elem != null) {
|
---|
126 | String audio_url = audio_elem.attr("src");
|
---|
127 |
|
---|
128 | MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url);
|
---|
129 | //System.out.println(mr.toJSONString());
|
---|
130 |
|
---|
131 | records.add(mr);
|
---|
132 | }
|
---|
133 |
|
---|
134 | }
|
---|
135 |
|
---|
136 | }
|
---|
137 |
|
---|
138 | return records;
|
---|
139 | }
|
---|
140 |
|
---|
141 | //private static void followArtistPages(String base_domain,String base_url, String artist_suffix)
|
---|
142 | private static void followArtistPages(String base_domain,String starting_href)
|
---|
143 | {
|
---|
144 | String artist_works_url_str = base_domain+starting_href;
|
---|
145 |
|
---|
146 | try {
|
---|
147 | boolean has_next_page = true;
|
---|
148 | int page_number = 1;
|
---|
149 |
|
---|
150 | while (has_next_page) {
|
---|
151 | System.out.println("Procesing Artist URL " + artist_works_url_str + ": Page " + page_number);
|
---|
152 |
|
---|
153 | Document doc = Jsoup.connect(artist_works_url_str).get();
|
---|
154 |
|
---|
155 | ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc);
|
---|
156 |
|
---|
157 | for (MusicRecord mr: music_records) {
|
---|
158 | String audio_url = mr.getAudioURL();
|
---|
159 | downloadURL(audio_url);
|
---|
160 |
|
---|
161 | saveJSONMetadata(mr,audio_url);
|
---|
162 |
|
---|
163 | //System.out.println(mr.toJSONString());
|
---|
164 |
|
---|
165 | }
|
---|
166 |
|
---|
167 | // <ul class="pagination">
|
---|
168 | // ...
|
---|
169 | // <li href="URL-TO-FOLLOW">Next</li>
|
---|
170 | // </ul>
|
---|
171 |
|
---|
172 | Elements pagination_block = doc.select("ul.pagination > li > a");
|
---|
173 | Element last_li_a = pagination_block.last();
|
---|
174 |
|
---|
175 | if (last_li_a != null) {
|
---|
176 | String last_li_a_text = last_li_a.text();
|
---|
177 | if (last_li_a_text.equals("Next >")) {
|
---|
178 | artist_works_url_str = base_domain+last_li_a.attr("href");
|
---|
179 | }
|
---|
180 | else {
|
---|
181 | has_next_page = false;
|
---|
182 | }
|
---|
183 | }
|
---|
184 | else {
|
---|
185 | has_next_page = false;
|
---|
186 | }
|
---|
187 |
|
---|
188 | page_number++;
|
---|
189 | }
|
---|
190 | }
|
---|
191 | catch (Exception e) {
|
---|
192 | e.printStackTrace();
|
---|
193 | }
|
---|
194 | }
|
---|
195 |
|
---|
196 |
|
---|
197 | /*
|
---|
198 | public static void pageScrapeArtist(String artist_suffix)
|
---|
199 | {
|
---|
200 |
|
---|
201 | // Example artist page at AMC
|
---|
202 | // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean
|
---|
203 |
|
---|
204 | String base_domain = "http://www.australianmusiccentre.com.au";
|
---|
205 | String base_url = "/search?type=work&sort=alphaTitleSort";
|
---|
206 | base_url += "&wfc[]=";
|
---|
207 |
|
---|
208 | followArtistPages(base_domain,base_url+"Roger+Dean");
|
---|
209 | }
|
---|
210 | */
|
---|
211 |
|
---|
212 |
|
---|
213 | public static void main(String[] args)
|
---|
214 | {
|
---|
215 |
|
---|
216 | // Represented artists at AMC
|
---|
217 | // http://www.australianmusiccentre.com.au/artists
|
---|
218 |
|
---|
219 | String base_domain = "http://www.australianmusiccentre.com.au";
|
---|
220 | String base_url = "/artists";
|
---|
221 |
|
---|
222 | String represented_artists_url_str = base_domain+base_url;
|
---|
223 |
|
---|
224 | try {
|
---|
225 | Document doc = Jsoup.connect(represented_artists_url_str).get();
|
---|
226 |
|
---|
227 | Elements artist_paras = doc.select("p.artist_home_name");
|
---|
228 |
|
---|
229 | for (Element ap : artist_paras) {
|
---|
230 |
|
---|
231 | Element artist_a = ap.select("a").first();
|
---|
232 | if (artist_a != null) {
|
---|
233 | String artist_about_url_str = base_domain + artist_a.attr("href");
|
---|
234 |
|
---|
235 |
|
---|
236 | // Now process about this artist page to get to the sequence of pages
|
---|
237 | Document about_doc = Jsoup.connect(artist_about_url_str).get();
|
---|
238 |
|
---|
239 | Element browse_by_artist = about_doc.select("#content > div.inner > ul > li > a").first();
|
---|
240 | if (browse_by_artist != null) {
|
---|
241 |
|
---|
242 | String browse_works_href= browse_by_artist.attr("href");
|
---|
243 |
|
---|
244 | followArtistPages(base_domain,browse_works_href);
|
---|
245 | }
|
---|
246 | }
|
---|
247 | }
|
---|
248 | }
|
---|
249 | catch (Exception e) {
|
---|
250 | e.printStackTrace();
|
---|
251 | }
|
---|
252 |
|
---|
253 | }
|
---|
254 | }
|
---|