source: other-projects/mars-music-recommender/trunk/amc-music-scrape/src/AMCArtistScrape.java@ 30417

Last change on this file since 30417 was 30416, checked in by davidb, 8 years ago

Initial cut at code for scraping music excerpts from AMC site

  • Property svn:executable set to *
File size: 2.7 KB
Line 
1
2import java.io.*;
3import java.net.*;
4import java.nio.*;
5import java.nio.channels.*;
6import java.nio.file.*;
7import java.util.*;
8
9import org.jsoup.Jsoup;
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
12import org.jsoup.select.Elements;
13
14
15class AMCArtistScrape
16{
17
18 public static void downloadURL(String url_str) {
19
20 try {
21 URL url = new URL(url_str);
22
23 String local_file_str = url_str.substring(url_str.lastIndexOf('/')+1, url_str.length() );
24 File local_file = new File(local_file_str);
25
26 if (!local_file.exists()) {
27 System.out.printf("Downloading audio to '%s'...\n",local_file_str);
28 FileOutputStream fos = new FileOutputStream(local_file);
29
30 ReadableByteChannel rbc = Channels.newChannel(url.openStream());
31 fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
32
33 System.out.println("... done");
34 }
35 else {
36 System.out.printf("Skipping '%s' as is already exists\n",local_file_str);
37 }
38
39 }
40 catch (Exception e) {
41 e.printStackTrace();
42 }
43 }
44
45 public static ArrayList<MusicRecord> extractResults(String urlstr)
46 {
47
48 ArrayList<MusicRecord> records = new ArrayList<MusicRecord>();
49
50 try {
51 Document doc = Jsoup.connect(urlstr).get();
52
53 Elements table_rows = doc.select("table.search_results tr");
54
55
56
57 for (Element tr : table_rows) {
58
59 Elements td_cells = tr.select("td");
60
61 if (td_cells.size() == 6) {
62 Element title_anchor = td_cells.get(0).select("a").first();
63 String track_url = title_anchor.attr("href");
64 String title = title_anchor.ownText();
65 String instrumentation = td_cells.get(1).ownText();
66 String composer = td_cells.get(2).ownText();
67 String year = td_cells.get(3).ownText();
68
69 Element audio_elem = td_cells.get(4).select("audio").first();
70 if (audio_elem != null) {
71 String audio_url = audio_elem.attr("src");
72
73 MusicRecord mr = new MusicRecord(track_url,title,composer,year,audio_url);
74 System.out.println(mr);
75
76 records.add(mr);
77 }
78
79 }
80
81 }
82 }
83 catch (Exception e) {
84 e.printStackTrace();
85 }
86
87 return records;
88 }
89
90 private static void printf(String msg, Object... args)
91 {
92 System.out.println(String.format(msg, args));
93 }
94
95
96 public static void main(String[] args)
97 {
98
99 // http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort&wfc[]=Roger+Dean
100 String baseUrl = "http://www.australianmusiccentre.com.au/search?type=work&sort=alphaTitleSort";
101 baseUrl += "&wfc[]=";
102
103 ArrayList<MusicRecord> music_records = extractResults(baseUrl+"Roger+Dean");
104
105 for (MusicRecord mr: music_records) {
106 downloadURL(mr.getAudioURL());
107 }
108 }
109}
Note: See TracBrowser for help on using the repository browser.