1 | #!/usr/bin/python
|
---|
2 |
|
---|
3 | from os import listdir
|
---|
4 | from os.path import isfile, join
|
---|
5 |
|
---|
6 | import json
|
---|
7 | import random
|
---|
8 | import sys
|
---|
9 |
|
---|
10 |
|
---|
11 |
|
---|
def tabulate_results():
    """Print a run summary and write the two genre-tag CSV reports.

    Reads the module-level accumulators ``jid_count``, ``genre_table`` and
    ``genre_table_tagcount`` and writes, inside ``output_dir``:

    * ``genre-tags-freq-10000.csv`` -- one row per genre tag with the
      count stored for it in ``genre_table``.
    * ``genre-tags-distribution-count-10000.csv`` -- one row per key of
      ``genre_table_tagcount`` (a tag count stored as a string) with its
      frequency, ordered by ascending numeric tag count.
    """
    print("====")
    print("Number of records processed " + str(jid_count))
    print("====")

    print("Generating ...")
    print(" Genre Tags Frequency Distribution")
    print(" (one track can be given assigned multiple genres)")

    # 'with' guarantees the file is closed even if a write fails part-way.
    with open(join(output_dir,"genre-tags-freq-10000.csv"), "w") as ofd_freq:
        ofd_freq.write("Genre tag,Frequency count\n")
        for gk in genre_table:
            ofd_freq.write("{0:20}".format(gk) + "," + str(genre_table[gk]) + "\n")

    print(" Number of Genre Tags Assigned per Track")

    with open(join(output_dir, "genre-tags-distribution-count-10000.csv"), "w") as ofd_distrib:
        ofd_distrib.write("Num of tags,Frequency count\n")
        # Keys are stringified integers; sort numerically so that "10"
        # comes after "2" rather than before it.
        for gk in sorted(genre_table_tagcount, key=int):
            ofd_distrib.write(gk + "," + str(genre_table_tagcount[gk]) + "\n")
|
---|
38 |
|
---|
39 |
|
---|
40 |
|
---|
def processTagset(jid,tagrec):
    """Fold one track's genre tags into the module-level tallies.

    Parameters:
        jid: identifier of the track; accepted but not used here.
        tagrec: mapping with a ``u'genres'`` entry holding the track's
            genre tags (the record may carry other entries, e.g.
            instruments and vartags, which are ignored).

    Side effects:
        ``genre_table_tagcount[str(len(genres))]`` is incremented once,
        and ``genre_table[g]`` is incremented for every tag ``g`` in the
        genre list. Missing keys start from zero.
    """
    genres = tagrec[u'genres']

    # The distribution table is keyed by the tag count as a string.
    num_genres_str = str(len(genres))
    genre_table_tagcount[num_genres_str] = genre_table_tagcount.get(num_genres_str, 0) + 1

    for g in genres:
        genre_table[g] = genre_table.get(g, 0) + 1
|
---|
57 |
|
---|
58 |
|
---|
59 |
|
---|
# ---- Script body: argument handling, shared state, processing ----

argc = len(sys.argv)

# Usage: <script> [input_dir [output_dir]]. Any other argument count
# falls back to the default directories.
input_dir = sys.argv[1] if (argc==2) or (argc==3) else "download-json-all"
output_dir = sys.argv[2] if argc==3 else "csv-data"

# Only regular files directly inside input_dir are processed.
json_files = [ jf for jf in listdir(input_dir) if isfile(join(input_dir,jf)) ]

jamendo_ids = []

# Accumulators filled in by processTagset() and reported by
# tabulate_results().
genre_table = {}
genre_table_tagcount = {}

jid_count = 0

# Load the list of track ids that make up the evened-out dataset.
with open('jamendo-evened-out-10000-dataset-trackids.json') as evened_out_fd:
    evened_out_json_data = evened_out_fd.read()
evened_out_jamendo_ids = json.loads(evened_out_json_data)

# Dict used as a set for constant-time membership tests.
# NOTE(review): assumes the ids in this file have the same type as
# rec[u'id'] in the input files -- confirm against the data.
evened_out_hashmap = {}
for jid in evened_out_jamendo_ids:
    evened_out_hashmap[jid] = True

print("Evened out dataset size = " + str(len(evened_out_hashmap)))

# Every track id seen in the input files, whether selected or not.
all_hashmap = {}

for jf in json_files:
    print("Processing" + jf)

    json_filename = join(input_dir,jf)
    # 'with' closes the file even if parsing or processing raises.
    with open(json_filename) as ifd:
        json_data = ifd.read()
    data = json.loads(json_data)

    for rec in data[u'results']:
        jid = rec[u'id']
        all_hashmap[jid] = True

        # Counts every record read, not just those in the evened-out set.
        jid_count += 1

        musicinfo = rec[u'musicinfo']
        tags = musicinfo[u'tags']
        if jid in evened_out_hashmap:
            processTagset(jid,tags)

tabulate_results()

print("Checking for consitency!")

# Report any evened-out track id that never appeared in the input files.
for jid in evened_out_jamendo_ids:
    if jid not in all_hashmap:
        print("**** Not found in all jid map:" + str(jid))
|
---|
119 |
|
---|
120 |
|
---|
121 |
|
---|
122 |
|
---|
123 |
|
---|
124 |
|
---|