[30005] | 1 | #!/usr/bin/python
|
---|
| 2 |
|
---|
| 3 | import json
|
---|
| 4 | import os
|
---|
| 5 | from os.path import isfile, join
|
---|
| 6 |
|
---|
| 7 | import sys
|
---|
| 8 | import csv
|
---|
| 9 |
|
---|
| 10 | import jamendo
|
---|
| 11 |
|
---|
# --- Tunables ---------------------------------------------------------------

# Diagnostic level; the per-track messages are only printed when this is >= 2.
verbosity = 1

# Selection stops once this many unique track ids have been collected.
dataset_size = 10000

# Per-artist cap that evened_out_by_rank() compares its per-artist counter to.
artist_limit = 20

# --- Tables filled by process_boosted_genre() -------------------------------

# track id -> artist id
track_id_to_artist_id = {}

# genre -> {track id: True}
evened_out_by_genre_distrib = {}

# track id -> list of genres the track was returned for
evened_out_by_id_distrib = {}

# rank (position within a genre's result list) -> list of track ids
evened_out_by_rank_distrib = {}

# --- Tables filled by evened_out_by_rank() ----------------------------------

# Chosen track ids, in the order they were picked; written out as JSON.
evened_out_unique_tracks = []

# track id -> True for every id already in evened_out_unique_tracks
evened_out_chosen_track = {}

# artist id -> running counter maintained by evened_out_by_rank()
evened_out_artist_limit = {}
|
---|
| 25 |
|
---|
def genre_sorted_by_freq(input_file,order='ascending'):
    """Read a genre/frequency CSV file and return its rows sorted by frequency.

    The first row of the file is treated as a header and discarded.  Every
    following non-blank row must provide the genre in column 0 and an
    integer frequency in column 1.

    Args:
        input_file: Path of the CSV file (comma separated, '"' as quote char).
        order: "descending" sorts the highest frequency first; any other
            value sorts ascending.

    Returns:
        A list of (genre, freq) tuples.  genre has trailing whitespace
        removed; freq is kept as the string read from the file (it is only
        converted to int for sorting).

    Raises:
        IndexError: a non-blank row has fewer than two columns.
        ValueError: a frequency cannot be converted to int.
    """
    with open(input_file) as csvfile:
        freq_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

        # Drop the header row.  The built-in next() with a default works on
        # Python 2.6+ and 3, and does not raise when the file is empty.
        next(freq_reader, None)

        # csv.reader yields [] for blank lines; skip those instead of
        # failing on row[0].
        freq_data = [(row[0].rstrip(), row[1]) for row in freq_reader if row]

    return sorted(freq_data,
                  key=lambda entry: int(entry[1]),
                  reverse=(order == "descending"))
|
---|
| 46 |
|
---|
| 47 |
|
---|
def process_boosted_genre(genre,rank_base,json_data):
    """Record one chunk of search results for a genre in the module tables.

    Args:
        genre: Genre tag the results were requested for.
        rank_base: Offset of this chunk within the genre's result list; the
            first result of the chunk gets rank rank_base + 1.
        json_data: Mapping with a u'results' sequence.  Each result must
            provide u'id' and u'artist_id' (and u'name' when verbosity >= 2).
            Ids are concatenated with strings in the diagnostics, so they
            are presumably strings - verify against the jamendo module.

    Side effects:
        Updates track_id_to_artist_id, evened_out_by_genre_distrib,
        evened_out_by_id_distrib and evened_out_by_rank_distrib.
        Returns None.
    """
    for rank, result in enumerate(json_data[u'results'], rank_base + 1):
        track_id = result[u'id']
        track_id_to_artist_id[track_id] = result[u'artist_id']

        if verbosity >= 2:
            # Single-argument print(...) gives the same output under the
            # Python 2 print statement and the Python 3 function.
            print("genre: " + genre + ", id = " + track_id + ", name = " + result[u'name'])

        # The per-genre table is keyed by genre.  The previous code tested
        # for the track id here, which never matched and therefore reset the
        # genre's dict on every track, keeping only the last one.
        evened_out_by_genre_distrib.setdefault(genre, {})[track_id] = True

        evened_out_by_id_distrib.setdefault(track_id, []).append(genre)
        evened_out_by_rank_distrib.setdefault(rank, []).append(track_id)
|
---|
| 76 |
|
---|
def evened_out_by_rank():
    """Choose tracks rank by rank until dataset_size tracks are collected.

    Walks evened_out_by_rank_distrib from the lowest rank number upwards.
    A track is appended to evened_out_unique_tracks when it has not been
    chosen before and its artist has fewer than artist_limit tracks in the
    selection so far.

    Reads track_id_to_artist_id and evened_out_by_rank_distrib; updates
    evened_out_unique_tracks, evened_out_chosen_track and
    evened_out_artist_limit in place.  Returns None.

    Raises:
        KeyError: a ranked track id has no entry in track_id_to_artist_id.
    """
    # Iterate ranks in ascending order explicitly: plain dict iteration does
    # not guarantee that rank 1 is visited before rank 2.
    for rank_id in sorted(evened_out_by_rank_distrib):
        print("Rank: " + str(rank_id))

        for track_id in evened_out_by_rank_distrib[rank_id]:
            if track_id in evened_out_chosen_track:
                if verbosity >= 2:
                    print(" Skipping as track already chosen: " + track_id)
                continue

            artist_id = track_id_to_artist_id[track_id]

            # Count only tracks that were actually accepted.  Counting every
            # occurrence (duplicates and rejected tracks included) used up
            # the allowance early and capped artists at artist_limit - 1.
            accepted_for_artist = evened_out_artist_limit.get(artist_id, 0)
            if accepted_for_artist >= artist_limit:
                if verbosity >= 2:
                    print(" Skipping track id due to artist limit: " + track_id)
                continue

            print(" Adding: " + track_id)
            evened_out_unique_tracks.append(track_id)
            evened_out_chosen_track[track_id] = True
            evened_out_artist_limit[artist_id] = accepted_for_artist + 1

            # >= rather than ==, so a list that is already over the target
            # still stops the selection.
            if len(evened_out_unique_tracks) >= dataset_size:
                print("Reached dataset size!")
                return
|
---|
| 107 |
|
---|
def boosted_by_genre(genre,curr_freq_level):
    """Fetch the first curr_freq_level results for a genre, chunk by chunk.

    Requests are made in steps of jamendo.chunk_size; the final request is
    shortened so that exactly curr_freq_level entries are asked for in
    total.  Each response is handed to process_boosted_genre() together
    with the chunk's starting offset.

    Args:
        genre: Genre tag, inserted verbatim into the "&tags=" query part.
            NOTE(review): no URL escaping is applied here - confirm that
            jamendo.download_from_jamendo or the tag values make that safe.
        curr_freq_level: Total number of entries to request for the genre.
    """
    step = jamendo.chunk_size

    for offset in range(0, curr_freq_level, step):
        # Full chunk, except for a shorter last one when curr_freq_level is
        # not a multiple of the chunk size.
        wanted = min(step, curr_freq_level - offset)

        json_data = jamendo.download_from_jamendo(
            offset, wanted, "&tags=" + genre + "&boost=popularity_total")
        process_boosted_genre(genre, offset, json_data)
|
---|
| 116 |
|
---|
| 117 |
|
---|
| 118 |
|
---|
def boosted_by_all_genres_descending(genre_freq_sorted,freq_levels):
    """Download results for every genre, sized by stepped frequency levels.

    For each genre the current level is lowered until the genre's frequency
    reaches it (or no lower level is left).  If the frequency is at least
    the level, that many entries are fetched via boosted_by_genre().  The
    level is never raised again, so the input is expected to be sorted by
    descending frequency.

    Args:
        genre_freq_sorted: Sequence of (genre, freq) pairs; freq must be
            convertible with int().
        freq_levels: Download sizes to step through, in the order they
            should be tried (the caller passes them highest first).  The
            list is not modified.  An empty list means no genre qualifies.

    Prints one line per downloaded genre and a final summary with the number
    of distinct tracks recorded in evened_out_by_id_distrib.  Returns None.
    """
    # Work on a private copy: popping from the argument would empty the
    # caller's list as a side effect.
    levels = list(freq_levels)
    level_index = 0

    for genrec in genre_freq_sorted:
        genre = genrec[0]
        freq = int(genrec[1])

        # Below the current level => move on to the next one, but stay on
        # the last level once the list is exhausted.
        while level_index < len(levels) - 1 and freq < levels[level_index]:
            level_index += 1

        if levels and freq >= levels[level_index]:
            curr_freq_level = levels[level_index]
            print("boost by genre " + genre + ": get "+ str(curr_freq_level) + " from " + str(freq) + " entries")
            boosted_by_genre(genre, curr_freq_level)

    print("*******")
    print("* Number of tracks to choose from: " + str(len(evened_out_by_id_distrib)))
    print("*******")
|
---|
| 155 |
|
---|
| 156 |
|
---|
# ---------------------------------------------------------------------------
# Script body: read the genre frequencies, download the boosted results per
# genre, pick the evened-out selection and write the chosen track ids.
# ---------------------------------------------------------------------------

argc = len(sys.argv)

# Take the CSV path from the first command-line argument whenever one is
# given.  Testing for exactly two argv entries silently fell back to the
# default file as soon as any extra argument was present.
input_file = sys.argv[1] if argc >= 2 else join("csv-data","genre-tags-freq.csv")

# Most frequent genre first, as boosted_by_all_genres_descending() only ever
# steps its level downwards.
genre_freq_sorted = genre_sorted_by_freq(input_file,order='descending')
#genre_freq_sorted = genre_sorted_by_freq(input_file,order='ascending')

# Debugging aid: dump the sorted genre table.
#for row in genre_freq_sorted:
#    print(','.join(row))

# Number of entries requested per genre, highest level first.
freq_levels = [1000, 800, 600, 400, 300, 200, 100, 50]

# Alternative level sets:
#freq_levels = [400, 200, 100, 10]
#freq_levels = [10, 200, 400]

boosted_by_all_genres_descending(genre_freq_sorted,freq_levels)

evened_out_by_rank()

# The result is a JSON array of the chosen track ids, in selection order.
with open("jamendo-evened-out-dataset-trackids.json", "w") as json_ofile:
    json.dump(evened_out_unique_tracks,json_ofile)
|
---|