source: other-projects/mirex/grand-challenge/generate-jamendo-dataset/scripts-2015/get-nc-sa-all.py@ 30006

Last change on this file since 30006 was 30006, checked in by davidb, 9 years ago

Changed to use 'true' rather than '1', which is what the API declares the type of this attribute to be

  • Property svn:executable set to *
File size: 2.3 KB
Line 
#!/usr/bin/python
"""Download Jamendo track metadata (ccnc=true, ccsa=true) in JSON chunks.

Tracks are requested from the Jamendo v3.0 "tracks" endpoint `chunk_size`
at a time, and each raw response is saved as
`<output_dir>/nc-sa-chunk-NNN.json`.  A chunk whose file already exists is
skipped, so an interrupted run can simply be started again.

Usage: get-nc-sa-all.py [start_offset]

The optional start_offset (e.g. 171800) is the track offset to begin at.
The first chunk number is derived from it (start_offset // chunk_size) so
that chunk file names stay in step with the offsets they were fetched from.
"""

import json
import os
import os.path
import urllib
import sys
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2


chunk_size = 200

base_url = "http://api.jamendo.com/v3.0/tracks/"
base_url_args = base_url + "?client_id=54cc3f68&format=jsonpretty&audioformat=mp32&audiodlformat=flac&ccnc=true&ccsa=true"

# OK, there are also the include fields 'licenses' and 'lyrics', but for GCUX
# these are currently considered unnecessary

base_url_args = base_url_args + "&include=musicinfo+stats"

chunk_url = base_url_args + "&limit=" + str(chunk_size)

output_dir = "download-json-all"

# Give up once this many chunks have failed to parse.
max_errors = 10


def parse_start_offset(argv):
    """Return the starting track offset requested on the command line.

    argv is the full argument vector (program name first).  When exactly one
    extra argument is present it is converted with int(), which raises
    ValueError for a non-numeric value; otherwise the offset is 0.
    """
    if len(argv) == 2:
        return int(argv[1])
    return 0


def chunk_path(chunk_count):
    """Return the output file path used for chunk number chunk_count."""
    return os.path.join(output_dir, "nc-sa-chunk-{0:03d}.json".format(chunk_count))


def response_headers(json_data):
    """Return the 'headers' object of a raw (bytes) API response.

    Raises ValueError when the payload is not valid UTF-8 JSON, and
    KeyError/TypeError when it has no 'headers' member.
    """
    return json.loads(json_data.decode("utf-8"))[u"headers"]


def main(argv):
    """Download every chunk from the starting offset until the API runs dry.

    The loop ends when a response reports fewer than chunk_size results, or
    after max_errors responses could not be processed.
    """
    if not os.path.isdir(output_dir):
        print("Creating directory: " + output_dir)
        os.mkdir(output_dir)

    offset = parse_start_offset(argv)
    # Keep the chunk numbering aligned with the starting offset.
    chunk_count = offset // chunk_size
    error_count = 0
    more_to_download = True

    while more_to_download:
        output_filename = chunk_path(chunk_count)

        if os.path.isfile(output_filename):
            print("Skipping Offset = " + str(offset) + " as downloaded file already exists")
            offset += chunk_size
            chunk_count += 1
            continue

        download_url = chunk_url + "&offset=" + str(offset)
        print("Downloading: " + download_url)
        # closing() releases the connection on both Python 2 and Python 3.
        with closing(urlopen(download_url)) as download_url_handle:
            json_data = download_url_handle.read()

        # The raw response is saved before it is parsed, so a response that
        # fails to parse still leaves a file that later runs will skip.
        with open(output_filename, "wb") as json_ofile:
            json_ofile.write(json_data)

        try:
            headers = response_headers(json_data)
            print(" Status: " + headers[u"status"])

            results_count = headers[u"results_count"]
            offset += results_count

            # A short chunk means the end of the result set was reached.
            if results_count != chunk_size:
                more_to_download = False
        except (ValueError, KeyError, TypeError):
            # Only malformed responses are treated as a failed chunk;
            # KeyboardInterrupt and SystemExit are left to propagate.
            print("Warning: failed to process Offset = " + str(offset) + " (chunk = " + str(chunk_count) + ")")
            print("Assuming failed block was standard size (" + str(chunk_size) + ")")
            offset += chunk_size
            error_count += 1

            if error_count >= max_errors:
                more_to_download = False

        chunk_count += 1

    print("=====")
    print("Retrieved {0} Non-Copyright Share-Alike tracks from Jamendo".format(offset))
    print("=====")


if __name__ == "__main__":
    main(sys.argv)
92print "====="
Note: See TracBrowser for help on using the repository browser.