[30005] | 1 | #!/usr/bin/python
|
---|
| 2 |
|
---|
| 3 | from os import listdir
|
---|
| 4 | from os.path import isfile, join
|
---|
| 5 |
|
---|
| 6 | import json
|
---|
| 7 | import random
|
---|
| 8 | import sys
|
---|
| 9 |
|
---|
| 10 |
|
---|
| 11 |
|
---|
def tabulate_results():
    """Print a run summary and write the two genre-tag CSV reports.

    Takes no arguments and returns nothing.  Reads the module-level names
    ``jid_count``, ``output_dir``, ``genre_table`` and
    ``genre_table_tagcount``.

    Two files are (re)created inside ``output_dir``:

    * ``genre-tags-freq-10000.csv`` -- one row per key of ``genre_table``
      (left-justified in a 20-character column) with its count.
    * ``genre-tags-distribution-count-10000.csv`` -- one row per key of
      ``genre_table_tagcount`` with its count, in ascending numeric order.

    Raises:
        IOError/OSError: if an output file cannot be opened or written
            (for example when ``output_dir`` does not exist).
        ValueError: if a key of ``genre_table_tagcount`` is not an
            integer or an integer-like string.
    """
    print("====")
    print("Number of records processed " + str(jid_count))
    print("====")

    print("Generating ...")
    print(" Genre Tags Frequency Distribution")
    print(" (one track can be given assigned multiple genres)")

    # `with` closes (and flushes) the file even if writing a row fails.
    freq_path = join(output_dir, "genre-tags-freq-10000.csv")
    with open(freq_path, "w") as ofd_freq:
        ofd_freq.write("Genre tag,Frequency count\n")
        for gk in genre_table:
            ofd_freq.write(
                "{0:20}".format(gk) + "," + str(genre_table[gk]) + "\n")

    print(" Number of Genre Tags Assigned per Track")

    distrib_path = join(
        output_dir, "genre-tags-distribution-count-10000.csv")
    with open(distrib_path, "w") as ofd_distrib:
        ofd_distrib.write("Num of tags,Frequency count\n")
        # The keys are tag counts stored as strings; sort them numerically
        # so that "10" comes after "2" instead of between "1" and "2".
        for gk in sorted(genre_table_tagcount, key=int):
            ofd_distrib.write(
                str(gk) + "," + str(genre_table_tagcount[gk]) + "\n")
| 38 |
|
---|
| 39 |
|
---|
| 40 |
|
---|
def processTagset(jid, tagrec):
    """Fold one track's genre tags into the module-level tallies.

    Args:
        jid: Track identifier.  Accepted for the caller's convenience; it
            is not used by this function.
        tagrec: Mapping with a ``'genres'`` entry holding a sized iterable
            of genre tags.  Any other entries are ignored.

    Side effects:
        * ``genre_table_tagcount[str(len(genres))]`` is incremented, i.e.
          the histogram of "how many genre tags does a track carry" is
          keyed by the count rendered as a string.
        * ``genre_table[g]`` is incremented once for every ``g`` in the
          track's genres.

    Raises:
        KeyError: if ``tagrec`` has no ``'genres'`` entry.
    """
    # genres, instruments, vartags
    # NOTE(review): the line above presumably lists the keys a tag record
    # can carry; only 'genres' is read here -- confirm against the data.
    genres = tagrec[u'genres']

    num_genres_str = str(len(genres))
    # dict.get() replaces the Python-2-only has_key() check-then-set.
    genre_table_tagcount[num_genres_str] = (
        genre_table_tagcount.get(num_genres_str, 0) + 1)

    for g in genres:
        genre_table[g] = genre_table.get(g, 0) + 1
| 57 |
|
---|
| 58 |
|
---|
| 59 |
|
---|
# ---------------------------------------------------------------------------
# Script driver.
#
# Usage: <script> [input_dir [output_dir]]
#
# Reads every regular file in input_dir as JSON, walks its 'results' list,
# and tallies genre tags for those records whose id appears in
# 'jamendo-evened-out-dataset-trackids.json' (read from the current working
# directory).  The tallies are then written out by tabulate_results().
# ---------------------------------------------------------------------------
argc = len(sys.argv)

# Defaults apply unless exactly one or two arguments were given.
input_dir = sys.argv[1] if argc in (2, 3) else "download-json-all"
output_dir = sys.argv[2] if argc == 3 else "csv-data"

json_files = [jf for jf in listdir(input_dir) if isfile(join(input_dir, jf))]

jamendo_ids = []

# Module-level tallies updated by processTagset() and read by
# tabulate_results().
genre_table = {}
genre_table_tagcount = {}

# Counts every record seen, whether or not it is in the evened-out subset.
jid_count = 0

# `with` ensures the handle is closed; the original never closed this file.
with open('jamendo-evened-out-dataset-trackids.json') as evened_out_fd:
    evened_out_json_data = evened_out_fd.read()
evened_out_jamendo_ids = json.loads(evened_out_json_data)

# Dict used as a membership set for the selected ids.
evened_out_hashmap = {}
for jid in evened_out_jamendo_ids:
    evened_out_hashmap[jid] = True

print("Evened out dataset size = " + str(len(evened_out_hashmap)))

# Every id encountered in the input files, for the consistency check below.
all_hashmap = {}

for jf in json_files:
    print("Processing" + jf)

    json_filename = join(input_dir, jf)
    # Close the file as soon as it is read, even if reading raises.
    with open(json_filename) as ifd:
        json_data = ifd.read()
    data = json.loads(json_data)

    for rec in data[u'results']:
        jid = rec[u'id']
        all_hashmap[jid] = True

        jid_count += 1

        musicinfo = rec[u'musicinfo']
        tags = musicinfo[u'tags']
        # NOTE(review): this membership test only matches when the ids in
        # the evened-out file have the same type (str vs int) as rec['id']
        # -- confirm against the data files.
        if jid in evened_out_hashmap:
            processTagset(jid, tags)

tabulate_results()

print("Checking for consitency!")

# Report every selected id that never showed up in the input files.
for jid in evened_out_jamendo_ids:
    if jid not in all_hashmap:
        print("**** Not found in all jid map:" + str(jid))
| 119 |
|
---|
| 120 |
|
---|
| 121 |
|
---|
| 122 |
|
---|
| 123 |
|
---|
| 124 |
|
---|