Changeset 30418 for other-projects
- Timestamp:
- 2016-03-21T15:02:26+13:00 (8 years ago)
- Location:
- other-projects/mars-music-recommender/trunk/amc-music-scrape/src
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/mars-music-recommender/trunk/amc-music-scrape/src/AMCArtistScrape.java
r30416 r30418 16 16 { 17 17 18 private static void printf(String msg, Object... args) 19 { 20 System.out.println(String.format(msg, args)); 21 } 22 18 23 public static void downloadURL(String url_str) { 19 24 … … 22 27 23 28 String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() ); 24 File local_file = new File( local_file_str);29 File local_file = new File("audio",local_file_str); 25 30 26 31 if (!local_file.exists()) { … … 43 48 } 44 49 45 public static ArrayList<MusicRecord> extract Results(String urlstr)50 public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc) 46 51 { 47 52 48 53 ArrayList<MusicRecord> records = new ArrayList<MusicRecord>(); 49 54 55 Elements table_rows = doc.select("table.search_results tr"); 56 57 for (Element tr : table_rows) { 58 59 Elements td_cells = tr.select("td"); 60 61 if (td_cells.size() == 6) { 62 Element title_anchor = td_cells.get(0).select("a").first(); 63 String track_url = title_anchor.attr("href"); 64 String title = title_anchor.ownText(); 65 String instrumentation = td_cells.get(1).ownText(); 66 String composer = td_cells.get(2).ownText(); 67 String year = td_cells.get(3).ownText(); 68 69 Element audio_elem = td_cells.get(4).select("audio").first(); 70 if (audio_elem != null) { 71 String audio_url = audio_elem.attr("src"); 72 73 MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url); 74 System.out.println(mr.toJSONString()); 75 76 records.add(mr); 77 } 78 79 } 80 81 } 82 83 return records; 84 } 85 86 private static void followArtistPages(String base_domain,String base_url, String artist_suffix) 87 { 88 String artist_url_str = base_domain+base_url+artist_suffix; 89 50 90 try { 51 Document doc = Jsoup.connect(urlstr).get(); 52 53 Elements table_rows = doc.select("table.search_results tr"); 91 boolean has_next_page = true; 92 int page_number = 1; 54 93 55 56 57 for (Element tr : table_rows) { 94 while (has_next_page) { 95 System.out.println("Procesing 
Artist suffix '" + artist_suffix + "': Page " + page_number); 58 96 59 Elements td_cells = tr.select("td");97 Document doc = Jsoup.connect(artist_url_str).get(); 60 98 61 if (td_cells.size() == 6) { 62 Element title_anchor = td_cells.get(0).select("a").first(); 63 String track_url = title_anchor.attr("href"); 64 String title = title_anchor.ownText(); 65 String instrumentation = td_cells.get(1).ownText(); 66 String composer = td_cells.get(2).ownText(); 67 String year = td_cells.get(3).ownText(); 99 ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc); 100 101 for (MusicRecord mr: music_records) { 102 downloadURL(mr.getAudioURL()); 103 } 68 104 69 Element audio_elem = td_cells.get(4).select("audio").first(); 70 if (audio_elem != null) { 71 String audio_url = audio_elem.attr("src"); 105 // <ul class="pagination"> 106 // ... 107 // <li href="URL-TO-FOLLOW">Next</li> 108 // </ul> 72 109 73 MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url); 74 System.out.println(mr); 110 Elements pagination_block = doc.select("ul.pagination > li > a"); 111 Element last_li_a = pagination_block.last(); 112 String last_li_a_text = last_li_a.text(); 113 if (last_li_a_text.equals("Next >")) { 114 artist_url_str = base_domain+last_li_a.attr("href"); 115 } 116 else { 117 has_next_page = false; 118 } 75 119 76 records.add(mr); 77 } 78 79 } 80 120 page_number++; 121 122 //System.out.println("*** last li a text = " + last_li_a_text); 123 //System.out.println("*** artist_url_str = " + artist_url_str); 81 124 } 82 125 } … … 84 127 e.printStackTrace(); 85 128 } 86 87 return records;88 129 } 89 130 90 private static void printf(String msg, Object... 
args)91 {92 System.out.println(String.format(msg, args));93 }94 131 95 132 … … 97 134 { 98 135 99 // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean 100 String baseUrl = "http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort"; 101 baseUrl += "&wfc[]="; 136 // Example artist page at AMC 137 // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean 102 138 103 ArrayList<MusicRecord> music_records = extractResults(baseUrl+"Roger+Dean"); 139 String base_domain = "http://www.australianmusiccentre.com.au"; 140 String base_url = "/search?type=work&sort=alphaTitleSort"; 141 base_url += "&wfc[]="; 104 142 105 for (MusicRecord mr: music_records) { 106 downloadURL(mr.getAudioURL()); 107 } 143 followArtistPages(base_domain,base_url,"Roger+Dean"); 108 144 } 109 145 } -
other-projects/mars-music-recommender/trunk/amc-music-scrape/src/MusicRecord.java
r30416 r30418 7 7 import java.util.*; 8 8 9 import org.json.simple.JSONArray; 10 import org.json.simple.JSONObject; 9 11 10 12 class MusicRecord { … … 33 35 title_,artist_,year_, track_url_, audio_url_); 34 36 } 35 37 38 public JSONObject toJSON() 39 { 40 JSONObject obj = new JSONObject(); 41 42 boolean object_is_all_null = true; 43 44 if (title_ != null) { 45 obj.put("title",title_); 46 object_is_all_null = false; 47 } 48 49 if (artist_ != null) { 50 obj.put("artist",artist_); 51 object_is_all_null = false; 52 } 53 54 if (year_ != null) { 55 obj.put("year",year_); 56 object_is_all_null = false; 57 } 58 59 if (track_url_ != null) { 60 obj.put("track_url",track_url_); 61 object_is_all_null = false; 62 } 63 64 if (audio_url_ != null) { 65 obj.put("audio_url",audio_url_); 66 object_is_all_null = false; 67 } 68 69 if (object_is_all_null) { 70 System.err.println("Warning: MusicRecord '" + this + "' only has null values in it"); 71 } 72 73 return obj; 74 } 75 76 public String toJSONString() 77 { 78 String jsonText = null; 79 JSONObject obj = toJSON(); 80 81 try { 82 StringWriter out = new StringWriter(); 83 obj.writeJSONString(out); 84 85 jsonText = out.toString(); 86 } 87 catch (IOException ioe) { 88 ioe.printStackTrace(); 89 } 90 91 return jsonText; 92 } 93 94 36 95 } 37 96
Note: See TracChangeset for help on using the changeset viewer.