source: other-projects/mars-music-recommender/trunk/amc-music-scrape/src/AMCArtistScrape.java@ 30425

Last change on this file since 30425 was 30425, checked in by davidb, 8 years ago

Save metadata as JSON file. Create sub-directories to spreadout the generated files

  • Property svn:executable set to *
File size: 6.2 KB
Line 
1
2import java.io.*;
3import java.net.*;
4import java.nio.*;
5import java.nio.charset.*;
6import java.nio.channels.*;
7import java.nio.file.*;
8import java.util.*;
9
10import org.jsoup.Jsoup;
11import org.jsoup.nodes.Document;
12import org.jsoup.nodes.Element;
13import org.jsoup.select.Elements;
14
15
16class AMCArtistScrape
17{
18
19 private static void printf(String msg, Object... args)
20 {
21 System.out.println(String.format(msg, args));
22 }
23
24 public static File urlToLocalFile(String url_str,String ext)
25 {
26
27 String local_file_str = url_str.substring(url_str.lastIndexOf(File.separator)+1, url_str.length() );
28
29 int ext_dot_pos = local_file_str.lastIndexOf('.');
30 String local_file_root_str = local_file_str;
31 if (ext_dot_pos>0) {
32 local_file_root_str = local_file_str.substring(0,ext_dot_pos);
33 }
34
35 if (ext != null) {
36 local_file_str = local_file_root_str + ext;
37 }
38
39 // Add in 'repair' style subdirectory
40 String repair_subdir = local_file_root_str.substring(local_file_root_str.length()-2);
41
42 File local_dir = new File("audio",repair_subdir);
43 File local_file = null;
44 try {
45 if (!local_dir.exists()) {
46 local_dir.mkdir();
47 }
48
49 local_file = new File(local_dir,local_file_str);
50 }
51 catch (Exception e) {
52 e.printStackTrace();
53 }
54
55 return local_file;
56 }
57
58 public static File urlToLocalFile(String url_str)
59 {
60 return urlToLocalFile(url_str,null);
61 }
62
63
64 public static void downloadURL(String url_str)
65 {
66
67 try {
68 URL url = new URL(url_str);
69
70 File local_file = urlToLocalFile(url_str);
71
72 if (!local_file.exists()) {
73 System.out.printf("Downloading audio to '%s'...\n",local_file.getPath());
74 FileOutputStream fos = new FileOutputStream(local_file);
75
76 ReadableByteChannel rbc = Channels.newChannel(url.openStream());
77 fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
78
79 System.out.println("... done");
80 }
81 else {
82 System.out.printf("Skipping '%s' as is already exists\n",local_file.getPath());
83 }
84
85 }
86 catch (Exception e) {
87 e.printStackTrace();
88 }
89 }
90
91 public static void saveJSONMetadata(MusicRecord mr, String url_str)
92 {
93 File json_file = urlToLocalFile(url_str,".json");
94 String json_text = mr.toJSONString();
95
96 try {
97 Files.write(Paths.get(json_file.getAbsolutePath()),json_text.getBytes(StandardCharsets.UTF_8));
98 }
99 catch (IOException e) {
100 e.printStackTrace();
101 }
102
103 }
104
105 public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc)
106 {
107
108 ArrayList<MusicRecord> records = new ArrayList<MusicRecord>();
109
110 Elements table_rows = doc.select("table.search_results tr");
111
112 for (Element tr : table_rows) {
113
114 Elements td_cells = tr.select("td");
115
116 if (td_cells.size() == 6) {
117 Element title_anchor = td_cells.get(0).select("a").first();
118 String track_url = title_anchor.attr("href");
119 String title = title_anchor.ownText();
120 String instrumentation = td_cells.get(1).ownText();
121 String composer = td_cells.get(2).ownText();
122 String year = td_cells.get(3).ownText();
123
124 Element audio_elem = td_cells.get(4).select("audio").first();
125 if (audio_elem != null) {
126 String audio_url = audio_elem.attr("src");
127
128 MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url);
129 //System.out.println(mr.toJSONString());
130
131 records.add(mr);
132 }
133
134 }
135
136 }
137
138 return records;
139 }
140
141 //private static void followArtistPages(String base_domain,String base_url, String artist_suffix)
142 private static void followArtistPages(String base_domain,String starting_href)
143 {
144 String artist_works_url_str = base_domain+starting_href;
145
146 try {
147 boolean has_next_page = true;
148 int page_number = 1;
149
150 while (has_next_page) {
151 System.out.println("Procesing Artist URL " + artist_works_url_str + ": Page " + page_number);
152
153 Document doc = Jsoup.connect(artist_works_url_str).get();
154
155 ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc);
156
157 for (MusicRecord mr: music_records) {
158 String audio_url = mr.getAudioURL();
159 downloadURL(audio_url);
160
161 saveJSONMetadata(mr,audio_url);
162
163 //System.out.println(mr.toJSONString());
164
165 }
166
167 // <ul class="pagination">
168 // ...
169 // <li href="URL-TO-FOLLOW">Next</li>
170 // </ul>
171
172 Elements pagination_block = doc.select("ul.pagination > li > a");
173 Element last_li_a = pagination_block.last();
174
175 if (last_li_a != null) {
176 String last_li_a_text = last_li_a.text();
177 if (last_li_a_text.equals("Next >")) {
178 artist_works_url_str = base_domain+last_li_a.attr("href");
179 }
180 else {
181 has_next_page = false;
182 }
183 }
184 else {
185 has_next_page = false;
186 }
187
188 page_number++;
189 }
190 }
191 catch (Exception e) {
192 e.printStackTrace();
193 }
194 }
195
196
197 /*
198 public static void pageScrapeArtist(String artist_suffix)
199 {
200
201 // Example artist page at AMC
202 // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean
203
204 String base_domain = "http://www.australianmusiccentre.com.au";
205 String base_url = "/search?type=work&sort=alphaTitleSort";
206 base_url += "&wfc[]=";
207
208 followArtistPages(base_domain,base_url+"Roger+Dean");
209 }
210 */
211
212
213 public static void main(String[] args)
214 {
215
216 // Represented artists at AMC
217 // http://www.australianmusiccentre.com.au/artists
218
219 String base_domain = "http://www.australianmusiccentre.com.au";
220 String base_url = "/artists";
221
222 String represented_artists_url_str = base_domain+base_url;
223
224 try {
225 Document doc = Jsoup.connect(represented_artists_url_str).get();
226
227 Elements artist_paras = doc.select("p.artist_home_name");
228
229 for (Element ap : artist_paras) {
230
231 Element artist_a = ap.select("a").first();
232 if (artist_a != null) {
233 String artist_about_url_str = base_domain + artist_a.attr("href");
234
235
236 // Now process about this artist page to get to the sequence of pages
237 Document about_doc = Jsoup.connect(artist_about_url_str).get();
238
239 Element browse_by_artist = about_doc.select("#content > div.inner > ul > li > a").first();
240 if (browse_by_artist != null) {
241
242 String browse_works_href= browse_by_artist.attr("href");
243
244 followArtistPages(base_domain,browse_works_href);
245 }
246 }
247 }
248 }
249 catch (Exception e) {
250 e.printStackTrace();
251 }
252
253 }
254}
Note: See TracBrowser for help on using the repository browser.