Changeset 33222 for main


Ignore:
Timestamp:
2019-06-26T15:52:27+12:00 (5 years ago)
Author:
davidb
Message:

More careful handling of Unicode/UTF-8

Location:
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

    r33204 r33222  
    124124##        for header_name in xlsutil.sanitized_headers:
    125125
    126             header_name = worksheet.cell_value(xlsutil.header_row_pos,col_j)
    127 
    128             if header_name and header_name.strip():
     126            header_name = xlsutil.cell_value_tidy_unicode(worksheet,xlsutil.header_row_pos,col_j)
     127
     128            if header_name:
    129129
    130130                header_col_j = header_names_mapping[header_name]
     
    132132##                header_col_j = header_names_mapping[header_name]
    133133
    134 ##                header_name = worksheet.cell_value(row_i,header_col_j)
     134##                header_name = xslutil.cell_value_tidy_unicode(worksheet,row_i,header_col_j)
    135135               
    136                 if header_name == "Report Identifier":
    137                     header_name = "hnz.Identifier"
    138 
    139                 if header_name == "Title":
    140                     header_name = "dc.Title"
    141 
    142                 if header_name == "Relevant TLA's":
    143                     header_name = "TLA"
    144 
    145                 header_name_utf8 = unicode(header_name).encode("utf-8")
     136                if header_name == u"Report Identifier":
     137                    header_name = u"hnz.Identifier"
     138
     139                if header_name == u"Title":
     140                    header_name = u"dc.Title"
     141
     142                if header_name == u"Relevant TLA's":
     143                    header_name = u"TLA"
     144
     145#                header_name_utf8 = unicode(header_name).encode("utf-8")
     146                header_name_utf8 = header_name.encode("utf-8")
    146147                metadata_name_utf8 = header_name_utf8.replace(" ", "")
    147148
    148149                entry_utf8_row.append(metadata_name_utf8)
    149150
    150                 if header_name == "Site No":
    151                     entry_utf8_row.append("SiteNoOrdering")
     151                if header_name == u"Site No":
     152                    entry_utf8_row.append(u"SiteNoOrdering")
    152153
    153154##            else:
     
    171172            for col_j in range(num_cols):
    172173##            for header_name in xlsutil.sanitized_headers:
    173                 header_name = worksheet.cell_value(xlsutil.header_row_pos,col_j)
    174 
    175                 if header_name and header_name.strip():
     174                header_name = xlsutil.cell_value_tidy_unicode(worksheet,xlsutil.header_row_pos,col_j)
     175
     176                if header_name:
    176177                    col_j = header_names_mapping[header_name]
    177178##                if header_name in header_names_mapping:
    178179##                    col_j = header_names_mapping[header_name]
    179180
    180                     cell_value = worksheet.cell_value(row_i,col_j)
    181 
    182                     formatted_cell_value = xlsutil.format_if_int(cell_value)
    183 
    184                     if header_name == "Report Identifier":
     181                    formatted_cell_value = xlsutil.cell_value_tidy_unicode(worksheet,row_i,col_j)
     182
     183                    if header_name == u"Report Identifier":
    185184                        # Check to see if companion PDF file present
    186185                        # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell_value+".pdf")
     
    208207                        formatted_cell_value = formatted_cell_value.rstrip();
    209208
    210                     formatted_cell_value_utf8 = unicode(formatted_cell_value).encode("utf-8")
     209#                    formatted_cell_value_utf8 = unicode(formatted_cell_value).encode("utf-8")
     210                    formatted_cell_value_utf8 = formatted_cell_value.encode("utf-8")
    211211
    212212                    ## Perform any cell transformations to make DL used spreadsheet
     
    216216                    formatted_utf8_row.append(formatted_cell_value_utf8)
    217217
    218                     if header_name == "Site No":
     218                    if header_name == u"Site No":
    219219                        site_no = formatted_cell_value_utf8
    220220                        site_no_ordering = None
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

    r33202 r33222  
    4949                header_col_j = header_names_mapping[header_name]
    5050
    51                 cell_value = worksheet.cell_value(row_i,header_col_j)
     51#                cell_value = worksheet.cell_value(row_i,header_col_j)
     52#
     53#                cell_value_utf8 = unicode(cell_value).encode("utf-8")
    5254
    53                 cell_value_utf8 = unicode(cell_value).encode("utf-8")
     55                cell_value_unicode = xlsutil.cell_value_tidy_unicode(worksheet,row_i,header_col_j)
     56
     57                cell_value_utf8 = cell_value_unicode.encode("utf-8")
     58
     59##                cell_value_utf8 = unicode(cell_value).encode("utf-8")
    5460
    5561                sanitized_utf8_row.append(cell_value_utf8)
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py

    r33204 r33222  
    66## worksheet_name = "Archaeological reports"
    77
    8 sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No",
    9                       "Authority No", "Site No", "Relevant TLA's", "Record Type", "Date Entered" ]
     8sanitized_headers = [ u"Report Identifier", u"Author", u"Title", u"Produced By", u"Date", u"HNZPT Region", u"File No",
     9                      u"Authority No", u"Site No", u"Relevant TLA's", u"Record Type", u"Date Entered" ]
     10
     11def make_unicode(value):
     12    value_unicode = value
     13
     14    if type(value) != unicode:
     15        value_str = value
     16        if type(value_str) != str:
     17            value_str = str(value)
     18       
     19        value_unicode =  value_str.decode('utf-8')
     20
     21    return value_unicode
     22
     23
     24def cell_value_tidy_unicode(worksheet,row_i,col_j):
     25    value= worksheet.cell_value(row_i,col_j)
     26
     27    value_unicode = make_unicode(value)
     28    value_unicode_stripped = value_unicode.strip()
     29
     30    return value_unicode_stripped
     31
    1032
    1133def format_if_int(cell_value):
     
    3658    found_header = False
    3759    for col_j in range(num_header_cols):
    38         header_cell_value = worksheet.cell_value(header_row_pos,col_j)
    39         if header_cell_value == "Report Identifier":
     60        header_cell_value = cell_value_tidy_unicode(worksheet,header_row_pos,col_j)
     61        if header_cell_value == u"Report Identifier":
    4062            found_header = True
    4163
Note: See TracChangeset for help on using the changeset viewer.