import java.io.*; import java.net.*; import java.nio.*; import java.nio.channels.*; import java.nio.file.*; import java.util.*; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; class AMCArtistScrape { private static void printf(String msg, Object... args) { System.out.println(String.format(msg, args)); } public static void downloadURL(String url_str) { try { URL url = new URL(url_str); String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() ); File local_file = new File("audio",local_file_str); if (!local_file.exists()) { System.out.printf("Downloading audio to '%s'...\n",local_file_str); FileOutputStream fos = new FileOutputStream(local_file); ReadableByteChannel rbc = Channels.newChannel(url.openStream()); fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); System.out.println("... done"); } else { System.out.printf("Skipping '%s' as is already exists\n",local_file_str); } } catch (Exception e) { e.printStackTrace(); } } public static ArrayList extractArtistRecordsFromPage(Document doc) { ArrayList records = new ArrayList(); Elements table_rows = doc.select("table.search_results tr"); for (Element tr : table_rows) { Elements td_cells = tr.select("td"); if (td_cells.size() == 6) { Element title_anchor = td_cells.get(0).select("a").first(); String track_url = title_anchor.attr("href"); String title = title_anchor.ownText(); String instrumentation = td_cells.get(1).ownText(); String composer = td_cells.get(2).ownText(); String year = td_cells.get(3).ownText(); Element audio_elem = td_cells.get(4).select("audio").first(); if (audio_elem != null) { String audio_url = audio_elem.attr("src"); MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url); System.out.println(mr.toJSONString()); records.add(mr); } } } return records; } private static void followArtistPages(String base_domain,String base_url, String artist_suffix) { String artist_url_str = base_domain+base_url+artist_suffix; try { boolean has_next_page = true; int page_number = 1; while (has_next_page) { System.out.println("Procesing Artist suffix '" + artist_suffix + "': Page " + page_number); Document doc = Jsoup.connect(artist_url_str).get(); ArrayList music_records = extractArtistRecordsFromPage(doc); for (MusicRecord mr: music_records) { downloadURL(mr.getAudioURL()); } //
    // ... //
  • Next
  • //
Elements pagination_block = doc.select("ul.pagination > li > a"); Element last_li_a = pagination_block.last(); String last_li_a_text = last_li_a.text(); if (last_li_a_text.equals("Next >")) { artist_url_str = base_domain+last_li_a.attr("href"); } else { has_next_page = false; } page_number++; //System.out.println("*** last li a text = " + last_li_a_text); //System.out.println("*** artist_url_str = " + artist_url_str); } } catch (Exception e) { e.printStackTrace(); } } public static void main(String[] args) { // Example artist page at AMC // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean String base_domain = "http://www.australianmusiccentre.com.au"; String base_url = "/search?type=work&sort=alphaTitleSort"; base_url += "&wfc[]="; followArtistPages(base_domain,base_url,"Roger+Dean"); } }