Changeset 30425 for other-projects/mars-music-recommender/trunk/amc-music-scrape/src/AMCArtistScrape.java
- Timestamp: 2016-03-21T21:15:13+13:00 (8 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/mars-music-recommender/trunk/amc-music-scrape/src/AMCArtistScrape.java
r30418 r30425 3 3 import java.net.*; 4 4 import java.nio.*; 5 import java.nio.charset.*; 5 6 import java.nio.channels.*; 6 7 import java.nio.file.*; … … 21 22 } 22 23 23 public static void downloadURL(String url_str) { 24 public static File urlToLocalFile(String url_str,String ext) 25 { 26 27 String local_file_str = url_str.substring(url_str.lastIndexOf(File.separator)+1, url_str.length() ); 28 29 int ext_dot_pos = local_file_str.lastIndexOf('.'); 30 String local_file_root_str = local_file_str; 31 if (ext_dot_pos>0) { 32 local_file_root_str = local_file_str.substring(0,ext_dot_pos); 33 } 34 35 if (ext != null) { 36 local_file_str = local_file_root_str + ext; 37 } 38 39 // Add in 'repair' style subdirectory 40 String repair_subdir = local_file_root_str.substring(local_file_root_str.length()-2); 41 42 File local_dir = new File("audio",repair_subdir); 43 File local_file = null; 44 try { 45 if (!local_dir.exists()) { 46 local_dir.mkdir(); 47 } 48 49 local_file = new File(local_dir,local_file_str); 50 } 51 catch (Exception e) { 52 e.printStackTrace(); 53 } 54 55 return local_file; 56 } 57 58 public static File urlToLocalFile(String url_str) 59 { 60 return urlToLocalFile(url_str,null); 61 } 62 63 64 public static void downloadURL(String url_str) 65 { 24 66 25 67 try { 26 68 URL url = new URL(url_str); 27 69 28 String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() ); 29 File local_file = new File("audio",local_file_str); 70 File local_file = urlToLocalFile(url_str); 30 71 31 72 if (!local_file.exists()) { 32 System.out.printf("Downloading audio to '%s'...\n",local_file _str);73 System.out.printf("Downloading audio to '%s'...\n",local_file.getPath()); 33 74 FileOutputStream fos = new FileOutputStream(local_file); 34 75 … … 39 80 } 40 81 else { 41 System.out.printf("Skipping '%s' as is already exists\n",local_file_str); 42 } 43 44 } 45 catch (Exception e) { 46 e.printStackTrace(); 47 } 48 } 49 82 System.out.printf("Skipping '%s' as is 
already exists\n",local_file.getPath()); 83 } 84 85 } 86 catch (Exception e) { 87 e.printStackTrace(); 88 } 89 } 90 91 public static void saveJSONMetadata(MusicRecord mr, String url_str) 92 { 93 File json_file = urlToLocalFile(url_str,".json"); 94 String json_text = mr.toJSONString(); 95 96 try { 97 Files.write(Paths.get(json_file.getAbsolutePath()),json_text.getBytes(StandardCharsets.UTF_8)); 98 } 99 catch (IOException e) { 100 e.printStackTrace(); 101 } 102 103 } 104 50 105 public static ArrayList<MusicRecord> extractArtistRecordsFromPage(Document doc) 51 106 { … … 72 127 73 128 MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url); 74 System.out.println(mr.toJSONString());129 //System.out.println(mr.toJSONString()); 75 130 76 131 records.add(mr); … … 84 139 } 85 140 86 private static void followArtistPages(String base_domain,String base_url, String artist_suffix) 87 { 88 String artist_url_str = base_domain+base_url+artist_suffix; 141 //private static void followArtistPages(String base_domain,String base_url, String artist_suffix) 142 private static void followArtistPages(String base_domain,String starting_href) 143 { 144 String artist_works_url_str = base_domain+starting_href; 89 145 90 146 try { … … 93 149 94 150 while (has_next_page) { 95 System.out.println("Procesing Artist suffix '" + artist_suffix + "': Page " + page_number);96 97 Document doc = Jsoup.connect(artist_ url_str).get();151 System.out.println("Procesing Artist URL " + artist_works_url_str + ": Page " + page_number); 152 153 Document doc = Jsoup.connect(artist_works_url_str).get(); 98 154 99 155 ArrayList<MusicRecord> music_records = extractArtistRecordsFromPage(doc); 100 156 101 157 for (MusicRecord mr: music_records) { 102 downloadURL(mr.getAudioURL()); 158 String audio_url = mr.getAudioURL(); 159 downloadURL(audio_url); 160 161 saveJSONMetadata(mr,audio_url); 162 163 //System.out.println(mr.toJSONString()); 164 103 165 } 104 166 … … 110 172 Elements pagination_block = 
doc.select("ul.pagination > li > a"); 111 173 Element last_li_a = pagination_block.last(); 112 String last_li_a_text = last_li_a.text(); 113 if (last_li_a_text.equals("Next >")) { 114 artist_url_str = base_domain+last_li_a.attr("href"); 174 175 if (last_li_a != null) { 176 String last_li_a_text = last_li_a.text(); 177 if (last_li_a_text.equals("Next >")) { 178 artist_works_url_str = base_domain+last_li_a.attr("href"); 179 } 180 else { 181 has_next_page = false; 182 } 115 183 } 116 184 else { … … 119 187 120 188 page_number++; 121 122 //System.out.println("*** last li a text = " + last_li_a_text);123 //System.out.println("*** artist_url_str = " + artist_url_str);124 189 } 125 190 } … … 130 195 131 196 132 133 public static void main(String[] args)197 /* 198 public static void pageScrapeArtist(String artist_suffix) 134 199 { 135 200 … … 141 206 base_url += "&wfc[]="; 142 207 143 followArtistPages(base_domain,base_url,"Roger+Dean"); 208 followArtistPages(base_domain,base_url+"Roger+Dean"); 209 } 210 */ 211 212 213 public static void main(String[] args) 214 { 215 216 // Represented artists at AMC 217 // http://www.australianmusiccentre.com.au/artists 218 219 String base_domain = "http://www.australianmusiccentre.com.au"; 220 String base_url = "/artists"; 221 222 String represented_artists_url_str = base_domain+base_url; 223 224 try { 225 Document doc = Jsoup.connect(represented_artists_url_str).get(); 226 227 Elements artist_paras = doc.select("p.artist_home_name"); 228 229 for (Element ap : artist_paras) { 230 231 Element artist_a = ap.select("a").first(); 232 if (artist_a != null) { 233 String artist_about_url_str = base_domain + artist_a.attr("href"); 234 235 236 // Now process about this artist page to get to the sequence of pages 237 Document about_doc = Jsoup.connect(artist_about_url_str).get(); 238 239 Element browse_by_artist = about_doc.select("#content > div.inner > ul > li > a").first(); 240 if (browse_by_artist != null) { 241 242 String browse_works_href= 
browse_by_artist.attr("href"); 243 244 followArtistPages(base_domain,browse_works_href); 245 } 246 } 247 } 248 } 249 catch (Exception e) { 250 e.printStackTrace(); 251 } 252 144 253 } 145 254 }
Note: See TracChangeset for help on using the changeset viewer.