source: other-projects/mars-music-recommender/trunk/amc-music-scrape/src/AMCArtistScrape.java@ 30418

Last change on this file since 30418 was 30418, checked in by davidb, 8 years ago

Code updated to work through a sequence of pages for one artist

  • Property svn:executable set to *
File size: 3.7 KB
Line 
1
2import java.io.*;
3import java.net.*;
4import java.nio.*;
5import java.nio.channels.*;
6import java.nio.file.*;
7import java.util.*;
8
9import org.jsoup.Jsoup;
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
12import org.jsoup.select.Elements;
13
14
15class AMCArtistScrape
16{
17
18 private static void printf(String msg, Object... args)
19 {
20 System.out.println(String.format(msg, args));
21 }
22
23 public static void downloadURL(String url_str) {
24
25 try {
26 URL url = new URL(url_str);
27
28 String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() );
29 File local_file = new File("audio",local_file_str);
30
31 if (!local_file.exists()) {
32 System.out.printf("Downloading audio to '%s'...\n",local_file_str);
33 FileOutputStream fos = new FileOutputStream(local_file);
34
35 ReadableByteChannel rbc = Channels.newChannel(url.openStream());
36 fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
37
38 System.out.println("... done");
39 }
40 else {
41 System.out.printf("Skipping '%s' as is already exists\n",local_file_str);
42 }
43
44 }
45 catch (Exception e) {
46 e.printStackTrace();
47 }
48 }
49
50 public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc)
51 {
52
53 ArrayList<MusicRecord> records = new ArrayList<MusicRecord>();
54
55 Elements table_rows = doc.select("table.search_results tr");
56
57 for (Element tr : table_rows) {
58
59 Elements td_cells = tr.select("td");
60
61 if (td_cells.size() == 6) {
62 Element title_anchor = td_cells.get(0).select("a").first();
63 String track_url = title_anchor.attr("href");
64 String title = title_anchor.ownText();
65 String instrumentation = td_cells.get(1).ownText();
66 String composer = td_cells.get(2).ownText();
67 String year = td_cells.get(3).ownText();
68
69 Element audio_elem = td_cells.get(4).select("audio").first();
70 if (audio_elem != null) {
71 String audio_url = audio_elem.attr("src");
72
73 MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url);
74 System.out.println(mr.toJSONString());
75
76 records.add(mr);
77 }
78
79 }
80
81 }
82
83 return records;
84 }
85
86 private static void followArtistPages(String base_domain,String base_url, String artist_suffix)
87 {
88 String artist_url_str = base_domain+base_url+artist_suffix;
89
90 try {
91 boolean has_next_page = true;
92 int page_number = 1;
93
94 while (has_next_page) {
95 System.out.println("Procesing Artist suffix '" + artist_suffix + "': Page " + page_number);
96
97 Document doc = Jsoup.connect(artist_url_str).get();
98
99 ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc);
100
101 for (MusicRecord mr: music_records) {
102 downloadURL(mr.getAudioURL());
103 }
104
105 // <ul class="pagination">
106 // ...
107 // <li href="URL-TO-FOLLOW">Next</li>
108 // </ul>
109
110 Elements pagination_block = doc.select("ul.pagination > li > a");
111 Element last_li_a = pagination_block.last();
112 String last_li_a_text = last_li_a.text();
113 if (last_li_a_text.equals("Next >")) {
114 artist_url_str = base_domain+last_li_a.attr("href");
115 }
116 else {
117 has_next_page = false;
118 }
119
120 page_number++;
121
122 //System.out.println("*** last li a text = " + last_li_a_text);
123 //System.out.println("*** artist_url_str = " + artist_url_str);
124 }
125 }
126 catch (Exception e) {
127 e.printStackTrace();
128 }
129 }
130
131
132
133 public static void main(String[] args)
134 {
135
136 // Example artist page at AMC
137 // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean
138
139 String base_domain = "http://www.australianmusiccentre.com.au";
140 String base_url = "/search?type=work&sort=alphaTitleSort";
141 base_url += "&wfc[]=";
142
143 followArtistPages(base_domain,base_url,"Roger+Dean");
144 }
145}
Note: See TracBrowser for help on using the repository browser.