Context Navigation

← Previous Changeset
Next Changeset →

Changeset 34910

Timestamp:

2021-02-22T23:38:30+13:00 (3 years ago)

Author:

davidb

Message:

Completion of initial work on supporting from-country voting to produce docs in the DL; written, then tested with building collection. This is the result after debugging

Location:

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel

Files:

: 3 edited

xlsx-fromcountry-jsonmetadata.py (modified) (9 diffs)
xlsx-tocountry-jsonmetadata.py (modified) (10 diffs)
xlsxutil.py (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel/xlsx-fromcountry-jsonmetadata.py

-              r34908
+              r34910
 import sys
 import json
-#import csv
 import argparse
 …
 #jury_from_countries = {}
 #tele_from_countries = {}
 …
-    # Filter down to just the voting results concerning finals
-    data_hashmap_array_finals = list(filter(xlsxutil.filter_finalist_votes, data_hashmap_array))
     # Sort so array entries are grouped by the country casting the votes in a given year
     data_hashmap_array_finals.sort(key=xlsxutil.sortkey_year_from_country)
+    data_hashmap_array.sort(key=xlsxutil.sortkey_year_from_country)
     # Debug output
+    #
     # for data_hashmap in data_hashmap_array_finals:
+    # for data_hashmap in data_hashmap_array:
     #    print(data_hashmap.get('From country'), data_hashmap.get('Year'), data_hashmap.get('Points'), "(Points to " + data_hashmap.get('To country') + ")")
 …
     country_groups = []
     prev_data_hashmap = data_hashmap_array_finals[0]
+    prev_data_hashmap = data_hashmap_array[0]
     country_group = [ ]
     i = 1
     num_finals = len(data_hashmap_array_finals)
     while (i < num_finals):
+    num_rows = len(data_hashmap_array)
+    while (i < num_rows):
         country_group.append(prev_data_hashmap)
         data_hashmap = data_hashmap_array_finals[i]
+        data_hashmap = data_hashmap_array[i]
         if (data_hashmap.get('From country') != prev_data_hashmap.get('From country')):
 …
     country_groups.append(country_group)
+    # Debug output
+    return country_groups
+def debug_print_country_groups(country_groups):
     for from_country_voting_array in country_groups:
 …
         for to_country_entry in from_country_voting_array:
+            print("  ", to_country_entry.get('Year'), to_country_entry.get('Points'),
+                  " ("+to_country_entry.get('Jury or Televoting')+") ",
+                  " Points to " + to_country_entry.get('To country'))
+    return country_groups
+def fileset_voting_for_esc_country_in_year(to_country_year_votes):
+            print("  "+ str(to_country_entry.get('Year')) + str(to_country_entry.get('Points')).rjust(3)
+                  +" ("+to_country_entry.get('Jury or Televoting')+") "
+                  +" Points to " + to_country_entry.get('To country'))
+def gs_fileset_voting_by_esc_country_in_year(from_country_year_votes, nul_output_dir_name):
     # Looking to build data-structure (for output as JSON) in the form
     # { "FileSet":
     #   [
     #     { "FileName": "France1991\.nul" },
+    #     { "FileName": "VotesFrom-France1991-J\.nul" }, // The country that cast the votes (J=Jury, T=Tele)
     #     { "Description":
     #       {
     #         "Metadata":
     #           [
     #             { "name": "Germany-J", "content": 12 }, # J = Jury Vote
     #             { "name": "Germany-T", "content": 6 },  # T = Televote (if present)
+    #             { "name": "Germany", "content": 12 },
+    #             { "name": "Denmark", "content": 6 },
     #             ...
     #           ]
 …
+    # Scan all voting to build up complete list of all countries that
+    # either cast or received votes
+    all_from_countries_year = []
+    all_to_countries_tj     = []
+    for from_country_voting_array in country_groups:
+        from_country = from_country_voting_array[0].get('From country')
+        year = from_country_voting_array[0].get('Year')
+        ### Remove spaces!!!
+        from_country_year = from_country + "-" + year
+        all_from_countries_year[from_country_year] = 1
+        for to_country_entry in from_country_voting_array:
+            to_country = to_country_entry.get('To country')
+            vote_type = to_country_entry.get('Jury or Televoting')
+            to_country_vote_type = to_country+"-"+vote_type
+            all_to_countries[to_country_vote_type] = 1
+    from_country_to_hashmap = []
+    for to_country_entry in from_country_voting_array:
+        print("  ", to_country_entry.get('Year'), to_country_entry.get('Points'),
+                  " ("+to_country_entry.get('Jury or Televoting')+") ",
+                  " Points to " + to_country_entry.get('To country'))
+    csv_header_array = sorted(all_to_countries_tj.keys())
+    csv_ofile = open(csv_filename, 'wb')
+    csv_wr  = csv.writer(csv_ofile, quoting=csv.QUOTE_ALL)
+#            csv_wr.writerow(filtered_utf8_row)
+    csv_ofile.close()
+    fileset_array = []
+    # Debug output
+#    for to_country_entry in from_country_year_votes:
+#       print("  ", to_country_entry.get('Year'), to_country_entry.get('Points'),
+#                 " ("+to_country_entry.get('Jury or Televoting')+") ",
+#                 " Points to " + to_country_entry.get('To country'))
     metadata_array = []
+    jury_metadata_vals = []
+    tele_metadata_vals = []
+    to_country_jury_total = 0
+    to_country_tele_total = 0
+    for to_country_year_vote in to_country_year_votes:
+        to_country   = to_country_year_vote.get('To country')
+        year         = to_country_year_vote.get('Year')
+        from_country = to_country_year_vote.get('From country')
+        vote_type    = to_country_year_vote.get('Jury or Televoting')
+        points       = to_country_year_vote.get('Points')
+        id_from_country = re.sub(r'\s+', '', from_country)
+        voting_rec = { "name": id_from_country+"-"+vote_type, "content": points }
+    metadata_country_vals = []
+    metadata_points_vals  = []
+    from_country_total = 0
+    # Use the first record to be a representative for 'top level' (tl)
+    # metadata about the voting 'From country'
+    tl_rec = from_country_year_votes[0]
+    tl_from_country = tl_rec.get('From country')
+    tl_year         = tl_rec.get('Year')
+    tl_vote_type    = tl_rec.get('Jury or Televoting')
+    tl_from_country_id = "FromCountry-" + re.sub(r'\s+', '', tl_from_country) + str(tl_year) + "-" + tl_vote_type
+    metadata_array.append({"name": "Identifier",      "content" : tl_from_country_id})
+    metadata_array.append({"name": "FromCountry",     "content" : tl_from_country})
+    metadata_array.append({"name": "FromCountryYear", "content" : tl_year})
+    metadata_array.append({"name": "FromCountryType", "content" : tl_vote_type})
+    for from_country_year_vote in from_country_year_votes:
+        to_country   = from_country_year_vote.get('To country')
+        year         = from_country_year_vote.get('Year')
+        from_country = from_country_year_vote.get('From country')
+        vote_type    = from_country_year_vote.get('Jury or Televoting')
+        points       = from_country_year_vote.get('Points')
+        to_country_year_id = re.sub(r'\s+', '', to_country) + str(year)
+        voting_rec = { "name": to_country_year_id, "content": points }
         metadata_array.append(voting_rec)
+        if (vote_type == "J"):
+            jury_metadata_vals.append(id_from_country+"-J")
+            jury_from_countries[id_from_country] = 1
+            to_country_jury_total = to_country_jury_total + points
+        elif (vote_type == "T"):
+            tele_metadata_vals.append(id_from_country+"-T")
+            tele_from_countries[id_from_country] = 1
+            to_country_tele_total = to_country_tele_total + points
+        else:
+            util.eprint("Warning: Unrecognized voting type: " + vote_type)
+    if (len(jury_metadata_vals)>0):
+        metadata_array.append({ "name": "JuryVotesJSON", "content": json.dumps(jury_metadata_vals) })
+        metadata_array.append({ "name": "JuryVotesTotal", "content": to_country_jury_total})
+    if (len(tele_metadata_vals)>0):
+        metadata_array.append({ "name": "TeleVotesJSON", "content": json.dumps(tele_metadata_vals) })
+        metadata_array.append({ "name": "TeleVotesTotal", "content": to_country_tele_total})
+    id_to_country = to_country_year_votes[0].get('To country')
+    id_to_country = re.sub(r'\s+', '', id_to_country)
+    id_year = to_country_year_votes[0].get('Year');
+    id = id_to_country + str(id_year);
+<    filename_id = id + "\\.nul"
+        metadata_country_vals.append(to_country)
+        metadata_points_vals.append(points)
+        from_country_total = from_country_total + points
+    if (len(metadata_country_vals)>0):
+        metadata_array.append({ "name": "VotesCountryJSON-"+tl_vote_type,  "content": json.dumps(metadata_country_vals) })
+        metadata_array.append({ "name": "VotesPointsJSON-"+tl_vote_type,  "content": json.dumps(metadata_points_vals) })
+        # metadata_array.append({ "name": "VotesTotal-"+tl_vote_type, "content": from_country_total})
+    # id encodes from-country, year, and vote_type
+    filename_id = tl_from_country_id + "\\.nul"
+    nul_filename = os.path.join(nul_output_dir_name,tl_from_country_id+".nul");
+    print("Creating: " + nul_filename)
+    with open(nul_filename, 'w') as outfile:
+        outfile.write("")
     fileset = {
         "FileSet" : [
 …
     return fileset
+def gs_directory_metadata(from_country_year_voting_groups):
+    # Express the grouped from-country voting data
+    # in the Greenstone JSON metadata format:
+    # { "DirectoryMetadata":
+    #   [
+    #     { "FileSet":
+    #       [
+    #         { "FileName": "FromCountry-France1991-J\.nul" },
+    #         { "Description":
+    #           {
+    #             "Metadata":
+    #              [
+    #                { "name": "Germany", "content": "12" },
+    #                  ...
+    #              ]
+    #           }
+    #         }
+    #       ]
+    #     }
+    #     ...
+    #    ]
+    #  }
+    nul_output_dir_name = os.path.dirname(json_output_filename)
+    directory_metadata = []
+    for from_country_year_votes in from_country_year_voting_groups:
+        fileset = gs_fileset_voting_by_esc_country_in_year(from_country_year_votes, nul_output_dir_name)
+        directory_metadata.append(fileset)
+        filename_id = fileset.get('FileSet')[0].get('FileName')
+        num_countries_voting_data = len(fileset.get('FileSet')[1].get('Description').get('Metadata'))
+        print("  " + filename_id.ljust(28) + ": " + str(num_countries_voting_data) + " votes")
+    greenstone_metadata_json = { "DirectoryMetadata": directory_metadata }
+    return greenstone_metadata_json
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extractc data from")
     parser.add_argument('--voting-type', choices=['J','T'], help="Filter to only J=Jury or T=Tele cast votes")
+    parser.add_argument('--votingtype', choices=['J','T'], help="Filter to only J=Jury or T=Tele cast votes")
     parser.add_argument('input-file.xlsx')
     parser.add_argument('output-file.csv', nargs='?')
+    parser.add_argument('output-file.json', nargs='?')
     args = parser.parse_args()
     sheetname = getattr(args,'sheetname');
+    if (csv_output_filename == None):
+        csv_output_filename = os.path.splitext(excel_input_filename)[0]+'.csv'
+    voting_type = getattr(args,'voting-type');
+    voting_type = getattr(args,'votingtype');
     excel_input_filename = getattr(args,'input-file.xlsx');
+    csv_output_filename = getattr(args,'output-file.csv');
+    json_output_filename = getattr(args,'output-file.json');
+    if (json_output_filename == None):
+        json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
 …
     data_hashmap_array = xlsxutil.convert_worksheet_to_hashmaps(worksheet)
     print("Number of data rows in Excel file: " + str(len(data_hashmap_array)))
+    print("Number of data rows in Excel file:   " + str(len(data_hashmap_array)))
     # Filter down to just the voting results concerning finals
     data_hashmap_array_finals = list(filter(xlsxutil.filter_finalist_votes, data_hashmap_array))
     print("Number of finalist voting data rows: " + str(len(data_hashmap_array_finals)))
+    data_hashmap_array_filtered = list(filter(xlsxutil.filter_finalist_votes, data_hashmap_array))
+    print("Number of finalist voting data rows: " + str(len(data_hashmap_array_filtered)))
     if voting_type != None:
         # Further filter down by the type of voting results cast
         if (voting_type == "J"):
             data_hashmap_array_finals = list(filter(xlsxutil.filter_jury_votes, data_hashmap_array_finals))
             print("Number Jury cast data rows: " + str(len(data_hashmap_array_finals)))
+            data_hashmap_array_filtered = list(filter(xlsxutil.filter_jury_votes, data_hashmap_array_filtered))
+            print("Number Jury cast data rows:          " + str(len(data_hashmap_array_filtered)))
         else:
             # Must be "T"
+            data_hashmap_array_finals = list(filter(xlsxutil.filter_tele_votes, data_hashmap_array_finals))
+            print("Number Jury cast data rows: " + str(len(data_hashmap_array_finals)))
+    from_country_year_voting_groups = create_from_country_voting_groups(data_hashmap_array_finals)
+            data_hashmap_array_filtered = list(filter(xlsxutil.filter_tele_votes, data_hashmap_array_filtered))
+            print("Number Televoting cast data rows:    " + str(len(data_hashmap_array_filtered)))
+    print()
+    from_country_year_voting_groups = create_from_country_voting_groups(data_hashmap_array_filtered)
+    debug_print_country_groups(from_country_year_voting_groups)
     # Debug output
+    #
+    #print(from_country_year_voting_groups)
+    # Next step is to express the grouped from-country voting data
+    # as a CSV file in the form used by Greenstone's CSVPlugin
+#    directory_metadata = []
+#    print("Creating Greenstone JSON voting metadata for:")
+#    for from_country_year_votes in from_country_year_voting_groups:
+#
+#        fileset = fileset_voting_for_esc_country_in_year(to_country_year_votes)
+#        directory_metadata.append(fileset)
+#
+#        filename_id = fileset.get('FileSet')[0].get('FileName')
+#        num_countries_voting_data = len(fileset.get('FileSet')[1].get('Description').get('Metadata'))
+#
+#        print("  " + filename_id.ljust(28) + ": " + str(num_countries_voting_data) + " votes")
+#    greenstone_metadata_json = { "DirectoryMetadata": directory_metadata }
+#
+#    with open(json_output_filename, 'w') as outfile:
+#        json.dump(greenstone_metadata_json, outfile, indent=2)
+    # print(from_country_year_voting_groups)
+    print()
+    print("Generating Greenstone JSON from-country voting metadata for:")
+    greenstone_metadata_json = gs_directory_metadata(from_country_year_voting_groups)
+    print("Saving output as: " + json_output_filename)
+    xlsxutil.save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename)
+    print()

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel/xlsx-tocountry-jsonmetadata.py

-              r34907
+              r34910
 tele_from_countries = {}
+#def fileset_voting_for_esc_country_in_year(data_hashmap):
+#
+#    # Only generating a single fileset record here, however
+#    # the Greenstone format allows for this to be an array
+#    # of fileset entries => return [ fileset ]
+#
+#    fileset = {}
+#
+#    return [ fileset ]
+#def filter_finalist_votes(entry):
+#    return entry.get('(semi-) final') == "f" and entry.get('Duplicate') == None
+#
+#def sortkey_year_to_country(entry):
+#    return str(entry.get('Year'))+entry.get('To country')
+def create_to_country_voting_groups(data_hashmap_array_finals):
+def create_to_country_voting_groups(data_hashmap_array):
     # Example values for header-names
 …
-#    # Filter down to just the voting results concerning finals
-#    data_hashmap_array_finals = list(filter(xlsxutil.filter_finalist_votes, data_hashmap_array))
     # Sort so array entries are grouped by the country receiving the votes in a given year
     data_hashmap_array_finals.sort(key=xlsxutil.sortkey_year_to_country)
+    data_hashmap_array.sort(key=xlsxutil.sortkey_year_to_country)
     # Debug output
+    #
     # for data_hashmap in data_hashmap_array_finals:
+    # for data_hashmap in data_hashmap_array:
     #    print(data_hashmap.get('To country'), data_hashmap.get('Year'), data_hashmap.get('Points'), "(Points from " + data_hashmap.get('From country') + ")")
 …
     country_groups = []
     prev_data_hashmap = data_hashmap_array_finals[0]
+    prev_data_hashmap = data_hashmap_array[0]
     country_group = [ ]
     i = 1
     num_finals = len(data_hashmap_array_finals)
     while (i < num_finals):
+    num_rows = len(data_hashmap_array)
+    while (i < num_rows):
         country_group.append(prev_data_hashmap)
         data_hashmap = data_hashmap_array_finals[i]
+        data_hashmap = data_hashmap_array[i]
         if (data_hashmap.get('To country') != prev_data_hashmap.get('To country')):
 …
     # }
-    fileset_array = []
     metadata_array = []
 …
     to_country_jury_total = 0
     to_country_tele_total = 0
+    # Use the first record to be a representative for 'top level' (tl)
+    # metadata about the votes cast 'To country'
+    tl_rec = to_country_year_votes[0]
+    tl_to_country = tl_rec.get('To country')
+    tl_year       = tl_rec.get('Year')
+    # tl_vote_type  = tl_rec.get('Jury or Televoting')
+    tl_to_country_id = re.sub(r'\s+', '', tl_to_country) + str(tl_year)
     for to_country_year_vote in to_country_year_votes:
 …
         metadata_array.append({ "name": "TeleVotesTotal", "content": to_country_tele_total})
+    id_to_country = to_country_year_votes[0].get('To country')
+    id_to_country = re.sub(r'\s+', '', id_to_country)
+    id_year = to_country_year_votes[0].get('Year');
+    id = id_to_country + str(id_year);
+    filename_id = id + "\\.nul"
+    # id_to_country = to_country_year_votes[0].get('To country')
+    # id_to_country = re.sub(r'\s+', '', id_to_country)
+    #id_year = to_country_year_votes[0].get('Year');
+    #id = id_to_country + str(id_year);
+    filename_id = tl_to_country_id + "\\.nul"
     fileset = {
 …
     return fileset
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('input-file.xlsx')
+    parser.add_argument('output-file.json', nargs='?')
+    parser.add_argument('--sheetname')
+    args = parser.parse_args()
+    excel_input_filename = getattr(args,'input-file.xlsx');
+    json_output_filename = getattr(args,'output-file.json');
+    sheetname = getattr(args,'sheetname');
+    if (json_output_filename == None):
+        json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
+    worksheet = xlsxutil.load_xslx_sheet(excel_input_filename,sheetname)
+    data_hashmap_array = xlsxutil.convert_worksheet_to_hashmaps(worksheet)
+    print("Number of data rows: " + str(len(data_hashmap_array)))
+    # Filter down to just the voting results concerning finals
+    data_hashmap_array_finals = list(filter(xlsxutil.filter_finalist_votes, data_hashmap_array))
+    print("Number of finalist voting data rows: " + str(len(data_hashmap_array_finals)))
+    to_country_year_voting_groups = create_to_country_voting_groups(data_hashmap_array_finals)
+    # Debug output
+    #
+    # print(to_country_year_voting_groups)
+def gs_directory_metadata(to_country_year_voting_groups):
     # Next step is to express the grouped to-country voting data
     # in the Greenstone JSON metadata format:
 …
     directory_metadata = []
-    print("Creating Greenstone JSON voting metadata for:")
     for to_country_year_votes in to_country_year_voting_groups:
 …
         print("  " + filename_id.ljust(28) + ": " + str(num_countries_voting_data) + " votes")
+    print("")
+    print("")
+    greenstone_metadata_json = { "DirectoryMetadata": directory_metadata }
+    return greenstone_metadata_json
+def display_gs_head_metadata_tags():
+    print()
+    print()
     print("For, e.g., '<display><format>' section of collectionConfig.xml:")
     print("")
+    print()
     print("  <gsf:headMetaTags>")
 …
         print("    <gsf:metadata name=\""+from_country+"-J\" />")
     print("")
+    print()
     for from_country in sorted(tele_from_countries.keys()):
         print("    <gsf:metadata name=\""+from_country+"-T\" />")
     print("  </gsf:headMetaTags>")
+    print("")
+    greenstone_metadata_json = { "DirectoryMetadata": directory_metadata }
+    with open(json_output_filename, 'w') as outfile:
+        json.dump(greenstone_metadata_json, outfile, indent=2)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extractc data from")
+    parser.add_argument('--votingtype', choices=['J','T'], help="Filter to only J=Jury or T=Tele cast votes")
+    parser.add_argument('input-file.xlsx')
+    parser.add_argument('output-file.json', nargs='?')
+    args = parser.parse_args()
+    sheetname = getattr(args,'sheetname');
+    voting_type = getattr(args,'votingtype');
+    excel_input_filename = getattr(args,'input-file.xlsx');
+    json_output_filename = getattr(args,'output-file.json');
+    if (json_output_filename == None):
+        json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
+    worksheet = xlsxutil.load_xslx_sheet(excel_input_filename,sheetname)
+    data_hashmap_array = xlsxutil.convert_worksheet_to_hashmaps(worksheet)
+    print("Number of data rows in Excel file:   " + str(len(data_hashmap_array)))
+    # Filter down to just the voting results concerning finals
+    data_hashmap_array_filtered = list(filter(xlsxutil.filter_finalist_votes, data_hashmap_array))
+    print("Number of finalist voting data rows: " + str(len(data_hashmap_array_filtered)))
+    if voting_type != None:
+        # Further filter down by the type of voting results cast
+        if (voting_type == "J"):
+            data_hashmap_array_filtered = list(filter(xlsxutil.filter_jury_votes, data_hashmap_array_filtered))
+            print("Number Jury cast data rows:          " + str(len(data_hashmap_array_filtered)))
+        else:
+            # Must be "T"
+            data_hashmap_array_filtered = list(filter(xlsxutil.filter_tele_votes, data_hashmap_array_filtered))
+            print("Number Televoting cast data rows:    " + str(len(data_hashmap_array_filtered)))
+    to_country_year_voting_groups = create_to_country_voting_groups(data_hashmap_array_filtered)
+    # Debug output
+    #
+    # print(to_country_year_voting_groups)
+    print()
+    print("Generating Greenstone JSON to-country voting metadata received by:")
+    greenstone_metadata_json = gs_directory_metadata(to_country_year_voting_groups)
+    print("Saving output as: " + json_output_filename)
+    xlsxutil.save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename)
+    display_gs_head_metadata_tags()
+    print()

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel/xlsxutil.py

-              r34892
+              r34910
 import os
 import sys
+import json
 import openpyxl
 …
 def filter_finalist_votes(entry):
+    return entry.get('(semi-) final') == "f" and entry.get('Duplicate') == None
+    return entry.get("(semi-) final") == "f" and entry.get('Duplicate') == None
+def filter_jury_votes(entry):
+    return entry.get("Jury or Televoting") == "J" and entry.get('Duplicate') == None
+def filter_tele_votes(entry):
+    return entry.get("Jury or Televoting") == "T" and entry.get('Duplicate') == None
 def sortkey_year_to_country(entry):
 …
     print ""
     return data_hashmap_array
+def save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename):
+    with open(json_output_filename, 'w') as outfile:
+        json.dump(greenstone_metadata_json, outfile, indent=2)

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: