#!/usr/bin/python
"""Tabulate genre-tag statistics for a selected subset of tracks.

Every JSON file in the input directory is read; each record found under its
``results`` key whose ``id`` is listed in
``jamendo-evened-out-dataset-trackids.json`` contributes its
``musicinfo.tags.genres`` list to two tables, which are then written as CSV
files into the output directory:

* ``genre-tags-freq-10000.csv``: how many selected tracks carry each genre tag
* ``genre-tags-distribution-count-10000.csv``: how many selected tracks carry
  0, 1, 2, ... genre tags

Usage: script [input_dir [output_dir]]

Only single-argument ``print(...)`` calls and ``file.write`` are used for
output so the script runs unchanged on both Python 2 and Python 3.
"""

from os import listdir
from os.path import isfile, join
import json
import random
import sys

# Defaults used when the corresponding command-line argument is absent;
# main() overrides them from sys.argv.
input_dir = "download-json-all"
output_dir = "csv-data"

# genre tag -> number of selected tracks carrying that tag
genre_table = {}
# str(number of genre tags on a track) -> number of selected tracks
genre_table_tagcount = {}
# Total number of records seen across all input files (selected or not).
jid_count = 0


def tabulate_results():
    """Print a short summary and write both CSV tables into ``output_dir``.

    Reads the module-level ``jid_count``, ``genre_table``,
    ``genre_table_tagcount`` and ``output_dir``.
    """
    print("====")
    print("Number of records processed " + str(jid_count))
    print("====")

    with open(join(output_dir, "genre-tags-freq-10000.csv"), "w") as ofd_freq:
        print("Generating ...")
        print(" Genre Tags Frequency Distribution")
        print(" (one track can be given assigned multiple genres)")
        ofd_freq.write("Genre tag,Frequency count" + "\n")
        for genre in genre_table:
            row = "{0:20}".format(genre) + "," + str(genre_table[genre])
            ofd_freq.write(row + "\n")

    distrib_path = join(output_dir, "genre-tags-distribution-count-10000.csv")
    with open(distrib_path, "w") as ofd_distrib:
        print(" Number of Genre Tags Assigned per Track")
        ofd_distrib.write("Num of tags,Frequency count" + "\n")
        # Keys are stringified integers; sort numerically so that "10"
        # follows "9" instead of landing between "1" and "2".
        for num_tags in sorted(genre_table_tagcount, key=int):
            row = num_tags + "," + str(genre_table_tagcount[num_tags])
            ofd_distrib.write(row + "\n")


def processTagset(jid, tagrec):
    """Fold one track's genre tags into the module-level tables.

    jid    -- track id (not used by the tabulation itself)
    tagrec -- mapping that must contain a ``genres`` list; any other keys
              are ignored

    Raises KeyError if ``tagrec`` has no ``genres`` entry.
    """
    genres = tagrec[u'genres']

    num_genres_str = str(len(genres))
    genre_table_tagcount[num_genres_str] = (
        genre_table_tagcount.get(num_genres_str, 0) + 1)

    for genre in genres:
        genre_table[genre] = genre_table.get(genre, 0) + 1


def main():
    """Run the full pipeline: read inputs, tabulate, write CSVs, cross-check."""
    global input_dir, output_dir, jid_count

    argc = len(sys.argv)
    if argc in (2, 3):
        input_dir = sys.argv[1]
    if argc == 3:
        output_dir = sys.argv[2]

    json_files = [jf for jf in listdir(input_dir)
                  if isfile(join(input_dir, jf))]

    with open('jamendo-evened-out-dataset-trackids.json') as ids_fd:
        evened_out_jamendo_ids = json.load(ids_fd)

    # Set of selected ids for O(1) membership tests; duplicates collapse.
    evened_out_ids = set(evened_out_jamendo_ids)
    print("Evened out dataset size = " + str(len(evened_out_ids)))

    # Every id seen in the input files, used for the consistency check below.
    all_ids = set()

    for jf in json_files:
        print("Processing" + jf)
        with open(join(input_dir, jf)) as ifd:
            data = json.load(ifd)

        for rec in data[u'results']:
            jid = rec[u'id']
            all_ids.add(jid)
            jid_count += 1

            # Looked up for every record, selected or not, so a malformed
            # record fails loudly instead of being silently skipped.
            tags = rec[u'musicinfo'][u'tags']

            if jid in evened_out_ids:
                processTagset(jid, tags)

    tabulate_results()

    print("Checking for consitency!")
    for jid in evened_out_jamendo_ids:
        if jid not in all_ids:
            print("**** Not found in all jid map:" + str(jid))


if __name__ == "__main__":
    main()