root/other-projects/mirex/grand-challenge/generate-jamendo-dataset/scripts-2015/generate-evened-out-genre-csv.py @ 30005

Revision 30005, 2.7 KB (checked in by davidb, 5 years ago)

Initial cut at scripts for generating a 'boosted' genre set of files

  • Property svn:executable set to *
Line 
1#!/usr/bin/python
2
3from os import listdir
4from os.path import isfile, join
5
6import json
7import random
8import sys
9
10
11
def tabulate_results():
    """Write the accumulated genre statistics to two CSV files in output_dir.

    Reads the module-level globals ``jid_count``, ``output_dir``,
    ``genre_table`` and ``genre_table_tagcount`` and writes:

      * ``genre-tags-freq-10000.csv`` -- one row per key of ``genre_table``
        (left-justified and padded to 20 characters) with its count.
      * ``genre-tags-distribution-count-10000.csv`` -- one row per key of
        ``genre_table_tagcount`` with its count, in ascending numeric order
        of the key.

    Progress messages are printed to stdout.  Returns nothing.
    """
    print("====")
    print("Number of records processed " + str(jid_count))
    print("====")

    # 'with' guarantees the file is closed even if a write fails part-way.
    with open(join(output_dir, "genre-tags-freq-10000.csv"), "w") as ofd_freq:
        print("Generating ...")
        print("  Genre Tags Frequency Distribution")
        print("  (one track can be given assigned multiple genres)")

        ofd_freq.write("Genre tag,Frequency count\n")

        for gk in genre_table:
            ofd_freq.write("{0:20}".format(gk) + "," + str(genre_table[gk]) + "\n")

    distrib_path = join(output_dir, "genre-tags-distribution-count-10000.csv")
    with open(distrib_path, "w") as ofd_distrib:
        print("  Number of Genre Tags Assigned per Track")
        ofd_distrib.write("Num of tags,Frequency count\n")

        # The keys are decimal strings (processTagset stores str(len(genres))),
        # so a plain sort would put "10" before "2"; order them numerically.
        for gk in sorted(genre_table_tagcount, key=int):
            ofd_distrib.write(gk + "," + str(genre_table_tagcount[gk]) + "\n")
38
39
40
def processTagset(jid,tagrec):
    """Fold one track's genre tags into the module-level tally tables.

    Parameters:
      jid    -- id of the track; accepted for the caller's convenience but
                not used by this function.
      tagrec -- mapping with a u'genres' entry holding a sequence of genre
                tags (other entries are ignored here).

    Side effects:
      * ``genre_table_tagcount[str(len(genres))]`` is incremented, i.e. the
        number of tracks seen with that many genre tags.
      * ``genre_table[g]`` is incremented once for every entry ``g`` in the
        genres sequence.

    Raises KeyError if ``tagrec`` has no u'genres' entry.
    """
    genres = tagrec[u'genres']

    # Keyed by the count rendered as a string, which is what
    # tabulate_results() writes out as the first CSV column.
    num_genres_str = str(len(genres))
    genre_table_tagcount[num_genres_str] = genre_table_tagcount.get(num_genres_str, 0) + 1

    for g in genres:
        genre_table[g] = genre_table.get(g, 0) + 1
57
58       
59
# ---------------------------------------------------------------------------
# Script body
#
# Usage: generate-evened-out-genre-csv.py [input_dir [output_dir]]
#
# Scans every JSON file in input_dir, tallies the genre tags of the tracks
# whose ids are listed in 'jamendo-evened-out-dataset-trackids.json' (read
# from the current working directory), writes the CSV summaries to
# output_dir, and finally reports any listed id that was never seen in the
# scanned files.
# ---------------------------------------------------------------------------

argc = len(sys.argv)

# Both positional arguments are optional; any surplus arguments are ignored
# rather than silently discarding the ones that were supplied.
input_dir  = sys.argv[1] if argc >= 2 else "download-json-all"
output_dir = sys.argv[2] if argc >= 3 else "csv-data"

json_files = [ jf for jf in listdir(input_dir) if isfile(join(input_dir,jf)) ]

jamendo_ids = []

# Tally tables filled in by processTagset() and written by tabulate_results().
genre_table = {}
genre_table_tagcount = {}

# Total number of records seen across all input files (selected or not).
jid_count = 0

with open('jamendo-evened-out-dataset-trackids.json') as evened_out_fd:
    evened_out_json_data = evened_out_fd.read()
evened_out_jamendo_ids = json.loads(evened_out_json_data)

# Dict used purely for fast membership tests on the selected track ids.
evened_out_hashmap = {}
for jid in evened_out_jamendo_ids:
    evened_out_hashmap[jid] = True

print("Evened out dataset size = " + str(len(evened_out_hashmap)))

# Every track id encountered in the input files, for the final cross-check.
all_hashmap = {}

for jf in json_files:
    print("Processing " + jf)

    json_filename = join(input_dir,jf)
    # Close the file as soon as it has been read, even if parsing fails.
    with open(json_filename) as ifd:
        json_data = ifd.read()
    data = json.loads(json_data)

    for rec in data[u'results']:

        jid = rec[u'id']
        all_hashmap[jid] = True

        jid_count += 1

        musicinfo = rec[u'musicinfo']
        tags = musicinfo[u'tags']
        # Only tracks in the evened-out selection contribute to the tallies.
        if jid in evened_out_hashmap:
            processTagset(jid,tags)


tabulate_results()

print("Checking for consitency!")

# Report selected ids that never appeared in any of the scanned files.
for jid in evened_out_jamendo_ids:
    if jid not in all_hashmap:
        print("**** Not found in all jid map:" + str(jid))
119
120
121
122
123
124
Note: See TracBrowser for help on using the browser.