Changeset 33188 for main

Show
Ignore:
Timestamp:
22.06.2019 10:10:17 (3 months ago)
Author:
davidb
Message:

Changes and refactoring to work with the new (XLSX) spreadsheet shared through OneDrive?

Location:
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports
Files:
1 added
3 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/PREPARE-IMPORT.sh

    r33029 r33188  
    22 
    33 
    4 python prepare/xls-to-csv.py import/Report\ spreadsheet\ for\ website\ Feb\ 2019.xls  
     4#python prepare/xls-to-csv.py import/Report\ spreadsheet\ for\ website\ Feb\ 2019.xls  
     5 
     6echo "Generating sanitized CSV version of spreadsheet suitable for download through DL" 
     7python prepare/xls-to-sanitized-csv.py "import/Archaeological report PDFs 20190620.xlsx" 
     8 
     9echo "Generating PDF-Bound and Unbound CSV files for use in import.pl" 
     10python prepare/xls-to-csv.py "import/Archaeological report PDFs 20190620.xlsx" 
     11 
     12 
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

    r33024 r33188  
    33import os 
    44 
     5import xlsutil 
     6 
    57## worksheet_name = "Archaeological reports" 
    6  
    7  
    8 # https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/ 
    9 def format_excelval(book, type, value, wanttupledate): 
    10     """ Clean up the incoming excel data """ 
    11     ##  Data Type Codes: 
    12     ##  EMPTY   0 
    13     ##  TEXT    1 a Unicode string  
    14     ##  NUMBER  2 float  
    15     ##  DATE    3 float  
    16     ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE  
    17     ##  ERROR   5  
    18     returnrow = [] 
    19     if   type == 2: # TEXT 
    20         if value == int(value): value = int(value) 
    21     elif type == 3: # NUMBER 
    22         datetuple = xlrd.xldate_as_tuple(value, book.datemode) 
    23         value = datetuple if wanttupledate else tupledate_to_isodate(datetuple) 
    24     elif type == 5: # ERROR 
    25         value = xlrd.error_text_from_code[value] 
    26     return value 
    27  
    288 
    299letter_to_folder = { 
     
    6848 
    6949 
     50def fixup_cell(col_num,cell_str): 
     51    # col == 1 => Author 
     52    # col == 2 => Title 
     53    # col == 4 => TLA 
     54    # col == 5 => AuthorityNo 
     55    # col == 6 => SiteNo 
     56 
     57    if col_num == 1: #Author 
     58        cell_str = re.sub('\|?(et al)|(Associates)', '', cell_str) 
     59 
     60    if col_num == 2: #Title 
     61        # Detect any ALL-CAPS titles, and change to title-case 
     62        onlyletters = re.sub('[^A-Za-z]+', '', cell_str) 
     63        if onlyletters.isupper: 
     64            cell_str = cell_str.title() 
     65 
     66    if col_num == 4: #TLA 
     67        if cell_str == "All":     cell_str = "" 
     68        if cell_str == "n/a":     cell_str = "" 
     69        if cell_str == "various": cell_str = "" # not a meaningful TLA 
     70 
     71 
     72    return cell_str 
     73 
     74 
    7075def excel_to_bound_pdf_csv(excel_filename): 
    71     workbook = xlrd.open_workbook(excel_filename, formatting_info=True) 
     76    #workbook = xlrd.open_workbook(excel_filename, formatting_info=True) 
     77    workbook = xlrd.open_workbook(excel_filename) 
    7278    ## worksheet = workbook.sheet_by_name(worksheet_name) 
    7379    worksheet = workbook.sheet_by_index(0) 
     
    9096    unbound_wr  = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) 
    9197 
    92     # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier") 
    93     # 2. Write out Header line as Greenstone friendly Metadata terms  
     98#    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier") 
     99#    # 2. Write out Header line as Greenstone friendly Metadata terms  
     100#    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier 
     101#    # 3. Process the rest of the file, checking if accompanying  
     102#    #     PDF file present or not 
     103 
     104 
     105    # 1. For header-line, build up hashmap of header-names to column number 
     106    # 2. Write out header-line as Greenstone friendly Metadata terms  
    94107    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier 
    95108    # 3. Process the rest of the file, checking if accompanying  
    96     #     PDF file present or not 
     109    #     PDF file present or not and only printing out the header-names 
     110    #     specified in 'sanitized_headers' 
     111 
    97112 
    98113    num_rows = worksheet.nrows 
    99114 
    100115    # 1, Skip lines until Header encountered 
    101     row_i = 0 
    102     found_header = False 
    103     while row_i<num_rows: 
    104         first_cell = worksheet.cell_value(row_i,0); 
    105         if first_cell == "Report Identifier": 
    106             found_header = True 
    107             break 
    108         print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 
    109         row_i = row_i + 1 
     116#    row_i = 0 
     117#    found_header = False 
     118#    while row_i<num_rows: 
     119#        first_cell = worksheet.cell_value(row_i,0); 
     120#        if first_cell == "Report Identifier": 
     121#            found_header = True 
     122#            break 
     123#        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 
     124#        row_i = row_i + 1 
     125 
     126    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 
    110127 
    111128    # 2. Process Header into Greenstone friendly metadata terms      
    112     if found_header: 
     129    if header_names_mapping is None: 
     130        print "Failed to find metadata label 'Report Identifier' in column 0" 
     131        exit() 
     132    else: 
    113133        entry_utf8_row = [] 
    114         for entry in worksheet.row_values(row_i): 
    115             if entry == "Report Identifier": 
    116                 entry = "hnz.Identifier" 
    117  
    118             if entry == "Title": 
    119                 entry = "dc.Title" 
    120  
    121             entry_utf8 = unicode(entry).encode("utf-8") 
    122             metadata_entry_utf8 = entry_utf8.replace(" ", "") 
    123  
    124             entry_utf8_row.append(metadata_entry_utf8) 
     134        row_i = 0; 
     135 
     136        for header_name in xlsutil.sanitized_headers: 
     137 
     138            if header_name in header_names_mapping: 
     139                header_col_j = header_names_mapping[header_name] 
     140 
     141                header_cell_value = worksheet.cell_value(row_i,header_col_j) 
     142                 
     143                if header_cell_value == "Report Identifier": 
     144                    header_cell_value = "hnz.Identifier" 
     145 
     146                if header_cell_value == "Title": 
     147                    header_cell_value = "dc.Title" 
     148 
     149                if header_cell_value == "RelevantTLA's": 
     150                    header_cell_value = "TLA" 
     151 
     152                header_cell_value_utf8 = unicode(header_cell_value).encode("utf-8") 
     153                metadata_name_utf8 = header_cell_value_utf8.replace(" ", "") 
     154 
     155                entry_utf8_row.append(metadata_name_utf8) 
     156 
     157            else: 
     158                print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping") 
     159 
    125160        unbound_wr.writerow(entry_utf8_row) 
    126161 
     
    128163        entry_utf8_row.insert(0, "Filename") 
    129164        pdfbound_wr.writerow(entry_utf8_row) 
    130  
     165                       
     166        # 3. Process the rest of the file ... 
    131167 
    132168        row_i = row_i + 1 
     
    137173            found_pdf = False 
    138174 
    139             for col_j in range(num_cols): 
    140                 cell = worksheet.cell(row_i,col_j) 
    141  
    142                 cell_type  = worksheet.cell_type(row_i,col_j) 
    143                 cell_value = worksheet.cell_value(row_i,col_j) 
    144  
    145                 formatted_cell = format_excelval(workbook,cell_type,cell_value,False) 
    146  
    147                 if col_j == 0: 
    148                     # Check to see if companion PDF file present 
    149                     # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf") 
    150                     id = formatted_cell.replace(" ","") 
    151  
    152                     pdf_file = id_to_relative_pdf_file(id) 
    153                     pdf_file_root, pdf_ext = os.path.splitext(pdf_file) 
    154                     PDF_file = pdf_file_root + ".PDF" 
    155  
    156                     pdf_filename = os.path.join(excel_dirname,pdf_file) 
    157                     PDF_filename = os.path.join(excel_dirname,PDF_file) 
    158  
    159                     if os.path.exists(pdf_filename): 
    160                         found_pdf = True 
    161                         formatted_utf8_row.insert(0, pdf_file) 
    162                     elif os.path.exists(PDF_filename): 
    163                         found_pdf = True 
    164                         formatted_utf8_row.insert(0, PDF_file) 
    165                     else: 
    166                         print "Unbound id: '" + id + "'" 
    167  
    168                 if isinstance(formatted_cell, basestring): 
    169                     # Remove any trailing whitespace.   
    170                     # Newline at end particular harmful for a entry in the CSV file 
    171                     formatted_cell = formatted_cell.rstrip(); 
    172  
    173                 formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8") 
    174                 formatted_utf8_row.append(formatted_cell_utf8) 
     175#            for col_j in range(num_cols): 
     176            for header_name in xlsutil.sanitized_headers: 
     177 
     178                if header_name in header_names_mapping: 
     179                    col_j = header_names_mapping[header_name] 
     180 
     181#                    cell = worksheet.cell(row_i,col_j) 
     182 
     183#                    cell_type  = worksheet.cell_type(row_i,col_j) 
     184                    cell_value = worksheet.cell_value(row_i,col_j) 
     185 
     186#                    formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False) 
     187                    formatted_cell_value = xlsutil.format_if_int(cell_value) 
     188 
     189#                    if col_j == 0: 
     190                    if header_name == "Report Identifier": 
     191                        # Check to see if companion PDF file present 
     192                        # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell_value+".pdf") 
     193                        id = formatted_cell_value.replace(" ","") 
     194                         
     195                        pdf_file = id_to_relative_pdf_file(id) 
     196                        pdf_file_root, pdf_ext = os.path.splitext(pdf_file) 
     197                        PDF_file = pdf_file_root + ".PDF" 
     198                     
     199                        pdf_filename = os.path.join(excel_dirname,pdf_file) 
     200                        PDF_filename = os.path.join(excel_dirname,PDF_file) 
     201 
     202                        if os.path.exists(pdf_filename): 
     203                            found_pdf = True 
     204                            formatted_utf8_row.insert(0, pdf_file) 
     205                        elif os.path.exists(PDF_filename): 
     206                            found_pdf = True 
     207                            formatted_utf8_row.insert(0, PDF_file) 
     208                        else: 
     209                            print "Unbound id: '" + id + "'" 
     210 
     211                    if isinstance(formatted_cell_value, basestring): 
     212                        # Remove any trailing whitespace.   
     213                        # Newline at end particular harmful for a entry in the CSV file 
     214                        formatted_cell_value = formatted_cell_value.rstrip(); 
     215 
     216                    formatted_cell_value_utf8 = unicode(formatted_cell_value).encode("utf-8") 
     217 
     218                    ## Perform any cell transformations to make DL used spreadsheet 
     219                    ## cleaner to build 
     220                    # formatted_cell_value_utf8 = fixup_cell(col_j,formatted_cell_value_utf8) 
     221 
     222                    formatted_utf8_row.append(formatted_cell_value_utf8) 
     223                else: 
     224                    print "Warning: No column number mapping for header name \""+header_name+"\" => skipping" 
     225 
    175226            if found_pdf: 
    176227                pdfbound_wr.writerow(formatted_utf8_row) 
     
    179230 
    180231            row_i = row_i + 1 
    181     else: 
    182         print "Failed to find metadata label 'Report Identifier' in column 0" 
    183232 
    184233 
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

    r33174 r33188  
    33import os 
    44 
     5import xlsutil 
     6 
    57## worksheet_name = "Archaeological reports" 
    68 
    7 sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No", 
    8                       "Authority No", "Site No", "Relevant TLA's", "Record Type" ] 
    99 
    1010def excel_to_sanitized_csv(excel_filename): 
     
    2727 
    2828    # 1. For header line, Build up hashmap of header-names to column number 
    29     # 2. Write out CSV file for only the header-names in 'keep_header_names' 
     29    # 2. Write out CSV file for only the header-names in 'sanitized_headers' 
    3030 
    3131    num_rows = worksheet.nrows 
    3232 
    33     header_names_mapping = {} 
     33#    row_i = 0 
     34#    num_header_cols = worksheet.row_len(row_i) 
    3435 
    35     # 1. For header line, Build up hashmap of header-names to column number 
    36     # Assume header line is first row of spreadsheet 
    37     row_i = 0 
    38     num_header_cols = worksheet.row_len(row_i) 
     36    # 1. get header-map 
     37    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 
    3938 
    40     found_header = False 
    41     for col_j in range(num_header_cols): 
    42         header_cell_value = worksheet.cell_value(row_i,col_j) 
    43         if header_cell_value == "Report Identifier": 
    44             found_header = True 
    45  
    46         header_names_mapping[header_cell_value] = col_j 
    47  
    48     if not(found_header): 
     39#    found_header = False 
     40#    for col_j in range(num_header_cols): 
     41#        header_cell_value = worksheet.cell_value(row_i,col_j) 
     42#        if header_cell_value == "Report Identifier": 
     43#            found_header = True 
     44# 
     45#        header_names_mapping[header_cell_value] = col_j 
     46# 
     47#    if not(found_header): 
     48    if header_names_mapping is None: 
    4949        print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)" 
    5050        exit(1) 
    5151 
    5252    # Move on to the start of the data values 
    53     row_i = row_i + 1 
     53    row_i = 1 
    5454 
     55    # 2. Write out CSV file ... 
    5556    while row_i<num_rows: 
    5657        # Work through sanitized header names building up row of utf8 values 
    5758        sanitized_utf8_row = [] 
    5859 
    59         for header_name in sanitized_headers: 
     60        for header_name in xlsutil.sanitized_headers: 
    6061 
    6162            if header_name in header_names_mapping: 
     
    6465                cell_value = worksheet.cell_value(row_i,header_col_j) 
    6566 
    66                 #formatted_cell = format_excelval(workbook,cell_type,cell_value,False) 
    6767 
    68                 # Numbers from worksheet are represented as floating-point type 
    69                 # This causes a problem when it is an 'int' as it then gets 
    70                 # written out as a floating point number (with a '.0') 
    71                 # => test for float and when there is no value after  
    72                 #    the decimal point, explicitly cast to 'int' 
    73                 if type(cell_value) is float: 
    74                     if cell_value == int(cell_value): cell_value = int(cell_value) 
    7568 
    7669                cell_value_utf8 = unicode(cell_value).encode("utf-8")