root/other-projects/mirex/grand-challenge/generate-jamendo-dataset/scripts-2015/get-nc-sa-all.py @ 30005

Revision 30005, 2.3 KB (checked in by davidb, 5 years ago)

Initial cut at scripts for generating a 'boosted' genre set of files

  • Property svn:executable set to *
Line 
#!/usr/bin/python
"""Download Jamendo track metadata (ccnc=1, ccsa=1) as chunked JSON files.

Usage: get-nc-sa-all.py [start_offset]

Queries the Jamendo v3.0 ``tracks`` API in pages of ``chunk_size`` results
and writes each raw response to ``download-json-all/nc-sa-chunk-NNN.json``.
A chunk whose output file already exists is skipped, so an interrupted run
can be restarted.  Paging stops when a response reports fewer than
``chunk_size`` results, or once ``max_errors`` responses could not be
processed.

The optional ``start_offset`` argument (e.g. 171800) is the track offset to
begin from; file numbering then starts at ``start_offset // chunk_size``.

Written for Python 2: it relies on ``urllib.urlopen``.  The single-argument
``print(...)`` calls below produce the same output as print statements.
"""

import json
import os
import os.path
import urllib
import sys


chunk_size = 200

# Give up after this many responses that could not be parsed/processed.
max_errors = 10

base_url = "http://api.jamendo.com/v3.0/tracks/"
base_url_args = base_url + "?client_id=54cc3f68&format=json&audioformat=mp32&audiodlformat=flac&ccnc=1&ccsa=1"

# There are also the include fields 'licenses' and 'lyrics', but for GCUX
# these are currently considered unnecessary.
base_url_args = base_url_args + "&include=musicinfo+stats"

chunk_url = base_url_args + "&limit=" + str(chunk_size)

output_dir = "download-json-all"

if not os.path.isdir(output_dir):
    print("Creating directory: " + output_dir)
    os.mkdir(output_dir)


more_to_download = 1
offset = 0
chunk_count = 0

error_count = 0

# Optional start offset, e.g. 171800.
if len(sys.argv) == 2:
    # The argument arrives as a string; the arithmetic below needs an int.
    # A non-numeric argument raises ValueError here, before any download.
    offset = int(sys.argv[1])
    # Number the output files from the chunk this offset falls in, so that
    # file NNN always corresponds to offset NNN * chunk_size and the
    # "already downloaded" check below looks at the matching file.
    chunk_offset = offset // chunk_size
    chunk_count = chunk_offset

while more_to_download:

    output_filename = os.path.join(output_dir, "nc-sa-chunk-{0:03d}.json".format(chunk_count))

    if os.path.isfile(output_filename):
        # Assumes the previously downloaded chunk was a full-size one.
        print("Skipping Offset = " + str(offset) + " as downloaded file already exists")
        offset += chunk_size
        chunk_count += 1
        continue

    download_url = chunk_url + "&offset=" + str(offset)
    print("Downloading: " + download_url)
    download_url_handle = urllib.urlopen(download_url)
    try:
        json_data = download_url_handle.read()
    finally:
        # Always release the connection, even if the read fails.
        download_url_handle.close()

    # The raw response is saved before it is parsed, so a response that
    # later fails to parse is still on disk (and will be skipped on rerun).
    with open(output_filename, "w") as json_ofile:
        json_ofile.write(json_data)

    try:
        data = json.loads(json_data)

        headers = data[u'headers']
        print("  Status: " + headers[u"status"])

        results_count = headers[u'results_count']

        offset += results_count

        # A short page means the end of the result set has been reached.
        if results_count != chunk_size:
            more_to_download = 0

    except (ValueError, KeyError, TypeError):
        # ValueError: response is not valid JSON.
        # KeyError/TypeError: response lacks the expected 'headers' fields
        # or they have an unexpected type.
        print("Warning: failed to process Offset = " + str(offset) + " (chunk = " + str(chunk_count) + ")")
        print("Assuming failed block was standard size (" + str(chunk_size) + ")")
        offset += chunk_size
        error_count = error_count + 1

        if error_count >= max_errors:
            more_to_download = 0

    chunk_count += 1

print("=====")
print("Retrieved {0} Non-Copyright Share-Alike tracks from Jamendo".format(offset))
print("=====")
Note: See TracBrowser for help on using the browser.