source: other-projects/mirex/grand-challenge/generate-jamendo-dataset/scripts-2015/generate-evened-out-genre-csv.py@ 30005

Last change on this file since 30005 was 30005, checked in by davidb, 9 years ago

Initial cut at scripts for generating a 'boosted' genre set of files

  • Property svn:executable set to *
File size: 2.7 KB
RevLine 
[30005]1#!/usr/bin/python
2
3from os import listdir
4from os.path import isfile, join
5
6import json
7import random
8import sys
9
10
11
def tabulate_results():
    """Write the collected genre-tag statistics to two CSV files.

    Reads the module-level ``jid_count``, ``genre_table``,
    ``genre_table_tagcount`` and ``output_dir`` and creates, inside
    ``output_dir``:

    * ``genre-tags-freq-10000.csv`` -- one row per genre tag with the
      number of times that tag was counted;
    * ``genre-tags-distribution-count-10000.csv`` -- one row per
      "number of genre tags on a track" value with the number of tracks
      counted for it, in ascending numeric order.

    Progress messages are printed to standard output.  Nothing is returned.
    """
    print("====")
    print("Number of records processed " + str(jid_count))
    print("====")

    freq_path = join(output_dir, "genre-tags-freq-10000.csv")
    # 'with' guarantees the file is closed even if a write fails part-way.
    with open(freq_path, "w") as ofd_freq:
        print("Generating ...")
        print(" Genre Tags Frequency Distribution")
        print(" (one track can be given assigned multiple genres)")

        ofd_freq.write("Genre tag,Frequency count\n")
        for gk in genre_table:
            # Genre name is left-justified and padded to 20 columns.
            ofd_freq.write("{0:20}".format(gk) + "," + str(genre_table[gk]) + "\n")

    distrib_path = join(output_dir, "genre-tags-distribution-count-10000.csv")
    with open(distrib_path, "w") as ofd_distrib:
        print(" Number of Genre Tags Assigned per Track")
        ofd_distrib.write("Num of tags,Frequency count\n")
        # The keys are decimal strings (see processTagset); sort them by
        # numeric value so that "10" follows "9" rather than preceding "2".
        for gk in sorted(genre_table_tagcount, key=int):
            ofd_distrib.write(gk + "," + str(genre_table_tagcount[gk]) + "\n")
38
39
40
def processTagset(jid, tagrec):
    """Fold one track's genre tags into the module-level tallies.

    Parameters:
        jid: identifier of the track; accepted for the caller's
            convenience but not used by this function.
        tagrec: mapping with a ``'genres'`` entry holding the track's
            genre tags.  (The inline note in the original lists
            "genres, instruments, vartags"; only ``'genres'`` is read.)

    Side effects:
        ``genre_table_tagcount[str(len(genres))]`` is incremented once, and
        ``genre_table[g]`` is incremented for every genre ``g`` of the track.

    Raises:
        KeyError: if ``tagrec`` has no ``'genres'`` entry.
    """
    genres = tagrec[u'genres']

    # The tag count is keyed by its decimal string form.
    num_genres_str = str(len(genres))
    # dict.get replaces the Python-2-only dict.has_key check.
    genre_table_tagcount[num_genres_str] = genre_table_tagcount.get(num_genres_str, 0) + 1

    for g in genres:
        genre_table[g] = genre_table.get(g, 0) + 1
57
58
59
# ---------------------------------------------------------------------------
# Script body.
#
# Usage: <script> [input_dir [output_dir]]
#
# Scans every JSON file in input_dir, tallies the genre tags of the tracks
# whose ids appear in 'jamendo-evened-out-dataset-trackids.json' (read from
# the current working directory), writes the CSV reports to output_dir, and
# finally reports any listed track id that was not found in the scanned files.
# ---------------------------------------------------------------------------
argc = len(sys.argv)

# The defaults are used unless exactly one or two arguments are supplied.
input_dir = sys.argv[1] if argc in (2, 3) else "download-json-all"
output_dir = sys.argv[2] if argc == 3 else "csv-data"

json_files = [jf for jf in listdir(input_dir) if isfile(join(input_dir, jf))]

jamendo_ids = []  # not referenced anywhere else in this script

# Tallies filled in by processTagset() and reported by tabulate_results().
genre_table = {}
genre_table_tagcount = {}

# Counts every record seen, whether or not it is in the evened-out set.
jid_count = 0

# 'with' closes the id list file; the original left the handle open.
with open('jamendo-evened-out-dataset-trackids.json') as evened_out_fd:
    evened_out_json_data = evened_out_fd.read()
evened_out_jamendo_ids = json.loads(evened_out_json_data)

# Dict used purely for fast membership tests on the selected track ids.
evened_out_hashmap = dict.fromkeys(evened_out_jamendo_ids, True)

print("Evened out dataset size = " + str(len(evened_out_hashmap)))

# Every track id encountered in the input files, for the consistency check.
all_hashmap = {}

for jf in json_files:
    print("Processing " + jf)

    json_filename = join(input_dir, jf)
    # Read the whole file up front so the handle is released immediately,
    # even if parsing or processing raises.
    with open(json_filename) as ifd:
        json_data = ifd.read()
    data = json.loads(json_data)

    for rec in data[u'results']:
        jid = rec[u'id']
        all_hashmap[jid] = True

        jid_count += 1

        musicinfo = rec[u'musicinfo']
        tags = musicinfo[u'tags']
        # Only tracks belonging to the evened-out set contribute to the tallies.
        if jid in evened_out_hashmap:
            processTagset(jid, tags)


tabulate_results()

print("Checking for consitency!")

# Report every selected id that never showed up in the scanned JSON files.
for jid in evened_out_jamendo_ids:
    if jid not in all_hashmap:
        print("**** Not found in all jid map:" + str(jid))
119
120
121
122
123
124
Note: See TracBrowser for help on using the repository browser.