#!/usr/bin/python import json import os from os.path import isfile, join import sys import csv import jamendo verbosity = 1 dataset_size = 10000 artist_limit = 20 track_id_to_artist_id = {} evened_out_by_genre_distrib = {} evened_out_by_id_distrib = {} evened_out_by_rank_distrib = {} evened_out_unique_tracks = [] evened_out_chosen_track = {} evened_out_artist_limit = {} def genre_sorted_by_freq(input_file,order='ascending'): with open(input_file) as csvfile: freq_reader = csv.reader(csvfile, delimiter=',', quotechar='"') # Remove first entry, which is text information freq_reader.next() freq_data = [] for row in freq_reader: genre=row[0].rstrip() freq=row[1] freq_data.append( (genre,freq)) if order == "descending": freq_data_sorted = sorted(freq_data, key=lambda data: int(data[1]),reverse=True) else: freq_data_sorted = sorted(freq_data, key=lambda data: int(data[1]),reverse=False) return freq_data_sorted def process_boosted_genre(genre,rank_base,json_data): results = json_data[u'results'] rank = 1 + rank_base for result in results: id = result[u'id']; name = result[u'name']; artist_id = result[u'artist_id']; track_id_to_artist_id[id] = artist_id if (verbosity>=2): print "genre: " + genre + ", id = " + id + ", name = " + result[u'name'] if evened_out_by_genre_distrib.get(id) == None: evened_out_by_genre_distrib[genre] = {} evened_out_by_genre_distrib[genre][id] = True; if evened_out_by_id_distrib.get(id) == None: evened_out_by_id_distrib[id] = [] evened_out_by_id_distrib[id].append(genre) if evened_out_by_rank_distrib.get(rank) == None: evened_out_by_rank_distrib[rank] = [] evened_out_by_rank_distrib[rank].append(id) rank = rank + 1 def evened_out_by_rank(): for rank_id in evened_out_by_rank_distrib: rank_id_list = evened_out_by_rank_distrib[rank_id] print "Rank: " + str(rank_id) for track_id in rank_id_list: artist_id = track_id_to_artist_id[track_id] if evened_out_artist_limit.get(artist_id) == None: evened_out_artist_limit[artist_id] = 0 evened_out_artist_limit[artist_id] = evened_out_artist_limit[artist_id] + 1 if (evened_out_chosen_track.get(track_id) == None): if (evened_out_artist_limit[artist_id] < artist_limit): print " Adding: " + track_id evened_out_unique_tracks.append(track_id) evened_out_chosen_track[track_id] = True else: if verbosity >= 2: print " Skipping track id due to artist limit: " + track_id else: if verbosity >= 2: print " Skipping as track already chosen: " + track_id if len(evened_out_unique_tracks) == dataset_size: print "Reached dataset size!" break if len(evened_out_unique_tracks) == dataset_size: break def boosted_by_genre(genre,curr_freq_level): for i in xrange(0, curr_freq_level, jamendo.chunk_size): curr_chunk_size = curr_freq_level % jamendo.chunk_size if (i +jamendo.chunk_size)>curr_freq_level else jamendo.chunk_size json_data = jamendo.download_from_jamendo(i,curr_chunk_size, "&tags="+genre+"&boost=popularity_total") process_boosted_genre(genre,i,json_data) def boosted_by_all_genres_descending(genre_freq_sorted,freq_levels): curr_freq_level = freq_levels.pop(0); break_count = 0 for genrec in genre_freq_sorted: # if break_count==2: # break genre = genrec[0] freq = int(genrec[1]) ## while freq<2*curr_freq_level: while freq move on to next lower limit # (but stop if freq_levels becomes empty) if (len(freq_levels)==0): break curr_freq_level = freq_levels.pop(0); if freq>=curr_freq_level: # still in comfortable zone for selecting print "boost by genre " + genre + ": get "+ str(curr_freq_level) + " from " + str(freq) + " entries" boosted_by_genre(genre,curr_freq_level) break_count = break_count + 1 print "*******" print "* Number of tracks to choose from: " + str(len(evened_out_by_id_distrib)) print "*******" argc = len(sys.argv) input_file = sys.argv[1] if argc==2 else join("csv-data","genre-tags-freq.csv") genre_freq_sorted = genre_sorted_by_freq(input_file,order='descending') #genre_freq_sorted = genre_sorted_by_freq(input_file,order='ascending') #for row in genre_freq_sorted: # print(','.join(row)) freq_levels = [1000, 800, 600, 400, 300, 200, 100, 50] #freq_levels = [400, 200, 100, 10] #freq_levels = [10, 200, 400] boosted_by_all_genres_descending(genre_freq_sorted,freq_levels) evened_out_by_rank() with open("jamendo-evened-out-dataset-trackids.json", "w") as json_ofile: json.dump(evened_out_unique_tracks,json_ofile)