Changeset 34861


Ignore:
Timestamp:
2021-02-15T16:37:20+13:00 (3 years ago)
Author:
davidb
Message:

General code moved to module

Location:
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel/xlsx-to-jsonmetadata.py

    r34860 r34861  
    66import argparse
    77
    8 # import xlrd
    98import openpyxl
    109
    11 #import csv
    12 
    13 #import xlsutil
    14 
    15 #filter_headers = [
    16 #    u"Heading 1", u"Heading 2"
    17 #    # ... and so on
    18 #]
    19 
    20 filter_headers = None
    21 
    22 def load_xslx_sheet(excel_input_filename,sheetname):
    23     print "Loading: " + excel_input_filename
    24     print "  ..."
    25     workbook = openpyxl.load_workbook(excel_input_filename, data_only=True)
    26     print "  ......Done"
    27 
    28     print "  Loaded worksheets: ",  workbook.sheetnames
    29 
    30     worksheet = None
    31    
    32     if (sheetname == None):
    33         print "  Selecting default/active worksheet"
    34         worksheet = workbook.active
    35     else:
    36         print "  Selecting worksheet '"+sheetname+"'"
    37         worksheet = workbook[sheetname]
    38 
    39     print ""
    40     return worksheet
    41 
    42 
    43 def header_to_column_mapping(worksheet,header_row_pos):
    44 
    45     # For header line, build up hashmap of header-names to column number
    46 
    47     header_names_mapping = {}
    48 
    49     min_col = worksheet.min_column
    50     max_col = worksheet.max_column
    51    
    52     found_header = False
    53     for col_j in range(min_col,max_col+1):
    54         header_cell = worksheet.cell(row=header_row_pos, column=col_j)
    55         header_cell_value = header_cell.value.strip()
    56        
    57         if header_cell_value:
    58             header_names_mapping[header_cell_value] = col_j
    59 
    60     return header_names_mapping
    61 
    62 
    63 def convert_worksheet_to_hashmaps(worksheet):
    64 
    65     print "Converting worksheet to array header-name keyed array of hashmaps"
    66    
    67     data_hashmap_array = [];
    68 
    69     # scan through rows until non-empty/headernames encountered
    70 
    71     header_row_pos = None;
    72    
    73     row_i = 1
    74     for row in worksheet.values:
    75         for value in row:
    76             if (value != None) :
    77                 header_row_pos = row_i
    78                 break
    79        
    80             print(value)
    81         if (header_row_pos != None): break
    82         row_i = row_i + 1
    83      
    84     print "  Found headername data in row: ", header_row_pos
    85        
    86     header_names_mapping = header_to_column_mapping(worksheet,header_row_pos)
    87     header_names = header_names_mapping.keys()
    88 
    89     # Move on to next row, to start processing the data values
    90     row_i = row_i + 1
    91     num_rows = worksheet.max_row;
    92    
    93     while row_i<=num_rows:
    94         # Work through the provided header names, building up hashmap of header-name to value
    95         data_hashmap = {}
    96        
    97         for header_name in header_names:
    98 
    99             header_col_j = header_names_mapping[header_name]
    100 
    101             data_cell = worksheet.cell(row=row_i,column=header_col_j)
    102             data_value = data_cell.value
    103 
    104             data_hashmap[header_name] = data_value
    105 
    106         row_i = row_i + 1
    107        
    108         data_hashmap_array.append(data_hashmap)
    109 
    110     print ""
    111     return data_hashmap_array
     10import xlsxutil
     11
    11212
    11313def fileset_voting_for_esc_country_in_year(data_hashmap):
     
    247147        json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
    248148       
    249     worksheet = load_xslx_sheet(excel_input_filename,sheetname)
    250 
    251     data_hashmap_array = convert_worksheet_to_hashmaps(worksheet)
     149    worksheet = xlsxutil.load_xslx_sheet(excel_input_filename,sheetname)
     150
     151    data_hashmap_array = xlsxutil.convert_worksheet_to_hashmaps(worksheet)
    252152
    253153    print "Number of data rows: ", len(data_hashmap_array)
    254154
     155   
    255156    to_country_year_voting_groups = create_to_country_voting_groups(data_hashmap_array)
    256157
     
    286187    ### TODO
    287188
    288     ## fix up names to match those used in DBpedia ESC
    289189    ## provide metadata value that lists the countries-year-type to voted
    290190
    291     # F.Y.R.Macedonia2012
    292     # Serbia&Montenegro2004.id    : 35 votes
    293     # Bosnia&Herzegovina2005.id   : 38 votes
    294     # TheNetherlands
    295    
    296 
    297     # From ESC:
    298     #   Republic of Macedonia (2005)
    299     #   Netherlands (not The Netherlands)
    300     #   Federal Republic of Yugoslavia (1992)
    301    
    302    
    303191    print "Creating Greenstone JSON voting metadata for:"
    304192    for to_country_year_votes in to_country_year_voting_groups:
Note: See TracChangeset for help on using the changeset viewer.