root/other-projects/mirex/grand-challenge/generate-jamendo-dataset/scripts-2015/generate-evened-out-10000-set.py @ 30005

Revision 30005, 5.6 KB (checked in by davidb, 5 years ago)

Initial cut at scripts for generating a 'boosted' genre set of files

  • Property svn:executable set to *
Line 
1#!/usr/bin/python
2
3import json
4import os
5from os.path import isfile, join
6
7import sys
8import csv
9
10import jamendo
11
# --- Tunable settings -------------------------------------------------------
# Diagnostic level: values >= 2 enable the per-track messages.
verbosity = 1
# Number of track ids at which selection stops.
dataset_size = 10000
# Per-artist cap applied while selecting tracks.
artist_limit = 20

# --- Shared lookup tables, filled in while results are processed ------------
# track id -> artist id
track_id_to_artist_id = {}

# genre -> {track id: True}
evened_out_by_genre_distrib = {}
# track id -> list of genres the track was returned for
evened_out_by_id_distrib = {}
# rank -> list of track ids seen at that rank
evened_out_by_rank_distrib = {}

# --- Selection output --------------------------------------------------------
# Chosen track ids, in the order they were selected.
evened_out_unique_tracks = []
# track id -> True once the track has been chosen
evened_out_chosen_track = {}
# artist id -> running count used to enforce artist_limit
evened_out_artist_limit = {}
25
def genre_sorted_by_freq(input_file,order='ascending'):
    """Read a genre-frequency CSV file and return its rows sorted by frequency.

    The first row of the file is treated as a header and discarded.  Every
    following non-blank row supplies the genre in column 0 and its frequency
    in column 1.

    Args:
        input_file: Path of the CSV file to read.
        order: "descending" sorts the highest frequency first; any other
            value sorts the lowest frequency first.

    Returns:
        A list of (genre, freq) tuples.  The genre has trailing whitespace
        stripped; freq is the unconverted string taken from the file.  The
        list is empty when the file contains no data rows.

    Raises:
        ValueError: If a frequency value cannot be converted with int().
        IndexError: If a non-blank row has fewer than two columns.
    """
    freq_data = []

    with open(input_file) as csvfile:
        freq_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

        # Drop the header row.  The default argument keeps a completely
        # empty file from raising StopIteration.
        next(freq_reader, None)

        for row in freq_reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            freq_data.append((row[0].rstrip(), row[1]))

    # Sort once, after all rows are collected.  sorted() is stable, so rows
    # with equal frequency keep their file order in either direction.
    return sorted(freq_data,
                  key=lambda data: int(data[1]),
                  reverse=(order == "descending"))
46
47
def process_boosted_genre(genre,rank_base,json_data):
    """Record one batch of results for `genre` in the module-level tables.

    Each entry of json_data[u'results'] is given a rank, starting at
    rank_base + 1 and increasing by one per entry, and is then filed under
    its artist, its genre, its track id and its rank.

    Args:
        genre: Genre tag the results were requested for.
        rank_base: Offset added to the 1-based position of each result
            within this batch to obtain its rank.
        json_data: Mapping with a u'results' list; each result must provide
            u'id' and u'artist_id' (and u'name' when verbosity >= 2).
            NOTE(review): ids are concatenated into messages, so they are
            presumably strings -- confirm against the jamendo module.

    Side effects:
        Updates track_id_to_artist_id, evened_out_by_genre_distrib,
        evened_out_by_id_distrib and evened_out_by_rank_distrib.
    """
    results = json_data[u'results']

    for rank, result in enumerate(results, 1 + rank_base):
        track_id = result[u'id']
        artist_id = result[u'artist_id']

        track_id_to_artist_id[track_id] = artist_id

        if verbosity >= 2:
            print("genre: " + genre + ", id = " + track_id + ", name = " + result[u'name'])

        # Key the per-genre table on the genre itself, so that tracks
        # accumulate across results instead of the table being recreated
        # (and emptied) for every track.
        evened_out_by_genre_distrib.setdefault(genre, {})[track_id] = True

        evened_out_by_id_distrib.setdefault(track_id, []).append(genre)
        evened_out_by_rank_distrib.setdefault(rank, []).append(track_id)
76
def evened_out_by_rank():
    """Select tracks rank by rank until dataset_size tracks are chosen.

    Ranks in evened_out_by_rank_distrib are visited in ascending order.
    Within a rank, a track is added to evened_out_unique_tracks unless it
    has already been chosen or its artist already has artist_limit tracks
    in the selection.

    Side effects:
        Appends to evened_out_unique_tracks, marks tracks in
        evened_out_chosen_track, counts chosen tracks per artist in
        evened_out_artist_limit, and prints progress messages.
    """
    if len(evened_out_unique_tracks) >= dataset_size:
        # Nothing left to select (e.g. dataset_size of 0, or a repeat call)
        return

    # Sort explicitly: dict iteration order is not a ranking guarantee.
    for rank_id in sorted(evened_out_by_rank_distrib):
        print("Rank: " + str(rank_id))

        for track_id in evened_out_by_rank_distrib[rank_id]:
            if track_id in evened_out_chosen_track:
                if verbosity >= 2:
                    print("    Skipping as track already chosen: " + track_id)
                continue

            artist_id = track_id_to_artist_id[track_id]
            chosen_for_artist = evened_out_artist_limit.get(artist_id, 0)

            if chosen_for_artist >= artist_limit:
                if verbosity >= 2:
                    print("    Skipping track id due to artist limit: " + track_id)
                continue

            print("  Adding: " + track_id)
            evened_out_unique_tracks.append(track_id)
            evened_out_chosen_track[track_id] = True
            # Count only tracks that were actually selected, so duplicates
            # seen under other genres do not use up the artist's allowance.
            evened_out_artist_limit[artist_id] = chosen_for_artist + 1

            if len(evened_out_unique_tracks) >= dataset_size:
                print("Reached dataset size!")
                return
107
def boosted_by_genre(genre,curr_freq_level):
    """Download the results for `genre` in chunks and record each chunk.

    Requests are issued for offsets 0, chunk_size, 2*chunk_size, ... below
    curr_freq_level, where chunk_size is jamendo.chunk_size.  Every chunk is
    passed to process_boosted_genre with its offset as the rank base.

    Args:
        genre: Genre tag placed in the "&tags=" part of the request.
        curr_freq_level: Total number of entries to request for this genre.

    NOTE(review): the first two arguments of jamendo.download_from_jamendo
    are presumably the result offset and the number of results wanted --
    confirm against the jamendo module.
    """
    chunk_size = jamendo.chunk_size
    query_args = "&tags="+genre+"&boost=popularity_total"

    for offset in range(0, curr_freq_level, chunk_size):
        # The last chunk is shortened when curr_freq_level is not an exact
        # multiple of the chunk size.
        curr_chunk_size = min(chunk_size, curr_freq_level - offset)

        json_data = jamendo.download_from_jamendo(offset, curr_chunk_size, query_args)
        process_boosted_genre(genre, offset, json_data)
116
117           
118
def boosted_by_all_genres_descending(genre_freq_sorted,freq_levels):
    """Fetch boosted results for each genre at the largest level it can fill.

    Walks the genres in the given order while stepping through freq_levels.
    Whenever a genre has fewer entries than the current level, the level is
    lowered to the next one in the list (it is never raised again).  A genre
    whose frequency reaches the current level has that many entries fetched
    via boosted_by_genre; a genre smaller than the last level is skipped.

    Args:
        genre_freq_sorted: Sequence of (genre, freq) pairs; freq must be
            convertible with int().  The stepping logic assumes the pairs
            are ordered from most to least frequent.
        freq_levels: Sequence of entry counts to try, largest first.  It is
            not modified.  When it is empty nothing is fetched.

    Side effects:
        Calls boosted_by_genre (which downloads and records results) and
        prints progress plus a summary of how many distinct tracks are
        available.
    """
    # Work on a private copy so the caller's list is left untouched.
    levels = list(freq_levels)
    level_idx = 0
    last_idx = len(levels) - 1

    for genrec in genre_freq_sorted:
        genre = genrec[0]
        freq = int(genrec[1])

        # Under the current level => move on to the next lower level,
        # stopping at the last one available.
        while level_idx < last_idx and freq < levels[level_idx]:
            level_idx += 1

        if levels and freq >= levels[level_idx]:
            curr_freq_level = levels[level_idx]
            print("boost by genre " + genre + ": get "+ str(curr_freq_level) + " from " + str(freq) + " entries")
            boosted_by_genre(genre,curr_freq_level)

    print("*******")
    print("* Number of tracks to choose from: " + str(len(evened_out_by_id_distrib)))
    print("*******")
155   
156       
argc = len(sys.argv)

# The genre-frequency CSV comes from the first command-line argument when
# one is supplied (additional arguments are ignored); otherwise the default
# location is used.
if argc >= 2:
    input_file = sys.argv[1]
else:
    input_file = join("csv-data","genre-tags-freq.csv")

# Most frequent genres first (pass order='ascending' for the reverse).
genre_freq_sorted = genre_sorted_by_freq(input_file,order='descending')

# Number of entries to request per genre, tried from largest to smallest.
freq_levels = [1000, 800, 600, 400, 300, 200, 100, 50]

boosted_by_all_genres_descending(genre_freq_sorted,freq_levels)

evened_out_by_rank()

# Write the chosen track ids out as a JSON list.
with open("jamendo-evened-out-dataset-trackids.json", "w") as json_ofile:
    json.dump(evened_out_unique_tracks,json_ofile)
Note: See TracBrowser for help on using the browser.