Context Navigation

← Previous Change
Next Change →

Changeset 33188 for main

Timestamp:

2019-06-22T10:10:17+12:00 (5 years ago)

Author:

davidb

Message:

Changes and refactoring to work with the new (XLSX) spreadsheet shared through OneDrive

Location:

main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports

Files:

: 1 added
: 3 edited

PREPARE-IMPORT.sh (modified) (1 diff)
prepare/xls-to-csv.py (modified) (6 diffs)
prepare/xls-to-sanitized-csv.py (modified) (3 diffs)
prepare/xlsutil.py (added)

Legend:

: Unmodified
: Added
: Removed

main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/PREPARE-IMPORT.sh

-              r33029
+              r33188
+python prepare/xls-to-csv.py import/Report\ spreadsheet\ for\ website\ Feb\ 2019.xls
+#python prepare/xls-to-csv.py import/Report\ spreadsheet\ for\ website\ Feb\ 2019.xls
+echo "Generating sanitized CSV version of spreadsheet suitable for download through DL"
+python prepare/xls-to-sanitized-csv.py "import/Archaeological report PDFs 20190620.xlsx"
+echo "Generating PDF-Bound and Unbound CSV files for use in import.pl"
+python prepare/xls-to-csv.py "import/Archaeological report PDFs 20190620.xlsx"

main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

-              r33024
+              r33188
 import os
+import xlsutil
 ## worksheet_name = "Archaeological reports"
-# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
-def format_excelval(book, type, value, wanttupledate):
-    """ Clean up the incoming excel data """
-    ##  Data Type Codes:
-    ##  EMPTY   0
-    ##  TEXT    1 a Unicode string
-    ##  NUMBER  2 float
-    ##  DATE    3 float
-    ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
-    ##  ERROR   5
-    returnrow = []
-    if   type == 2: # NUMBER (float): collapse whole-number values to int
-        if value == int(value): value = int(value)
-    elif type == 3: # DATE (float): convert using the workbook's datemode
-        datetuple = xlrd.xldate_as_tuple(value, book.datemode)
-        value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
-    elif type == 5: # ERROR: replace the error code with its text
-        value = xlrd.error_text_from_code[value]
-    return value
 letter_to_folder = {
 …
+def fixup_cell(col_num,cell_str):
+    """Tidy one spreadsheet cell value according to the column it came from.
+
+    col_num  -- column number of the cell (see the table below); only
+                columns 1, 2 and 4 are altered, any other column is
+                returned unchanged
+    cell_str -- the cell's text
+
+    Returns the (possibly modified) cell text.
+    """
+    # col == 1 => Author
+    # col == 2 => Title
+    # col == 4 => TLA
+    # col == 5 => AuthorityNo
+    # col == 6 => SiteNo
+    if col_num == 1: #Author
+        # Drop "et al" (with an optional leading '|') and "Associates"
+        cell_str = re.sub(r'\|?(et al)|(Associates)', '', cell_str)
+    if col_num == 2: #Title
+        # Detect any ALL-CAPS titles, and change to title-case
+        onlyletters = re.sub('[^A-Za-z]+', '', cell_str)
+        # isupper has to be *called*: the bare method object is always
+        # truthy, which title-cased every title whatever its case
+        if onlyletters.isupper():
+            cell_str = cell_str.title()
+    if col_num == 4: #TLA
+        if cell_str == "All":     cell_str = ""
+        if cell_str == "n/a":     cell_str = ""
+        if cell_str == "various": cell_str = "" # not a meaningful TLA
+    return cell_str
 def excel_to_bound_pdf_csv(excel_filename):
+    workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
+    #workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
+    workbook = xlrd.open_workbook(excel_filename)
     ## worksheet = workbook.sheet_by_name(worksheet_name)
     worksheet = workbook.sheet_by_index(0)
 …
     unbound_wr  = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
+    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
+    # 2. Write out Header line as Greenstone friendly Metadata terms
+#    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
+#    # 2. Write out Header line as Greenstone friendly Metadata terms
+#    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier
+#    # 3. Process the rest of the file, checking if accompanying
+#    #     PDF file present or not
+    # 1. For header-line, build up hashmap of header-names to column number
+    # 2. Write out header-line as Greenstone friendly Metadata terms
     #      => remove spaces, change "Report Identifier" to hnz.Identifier
     # 3. Process the rest of the file, checking if accompanying
+    #     PDF file present or not
+    #     PDF file present or not and only printing out the header-names
+    #     specified in 'sanitized_headers'
     num_rows = worksheet.nrows
     # 1. Skip lines until Header encountered
+    row_i = 0
+    found_header = False
+    while row_i<num_rows:
+        first_cell = worksheet.cell_value(row_i,0);
+        if first_cell == "Report Identifier":
+            found_header = True
+            break
+        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
+        row_i = row_i + 1
+#    row_i = 0
+#    found_header = False
+#    while row_i<num_rows:
+#        first_cell = worksheet.cell_value(row_i,0);
+#        if first_cell == "Report Identifier":
+#            found_header = True
+#            break
+#        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
+#        row_i = row_i + 1
+    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
     # 2. Process Header into Greenstone friendly metadata terms
+    if found_header:
+    if header_names_mapping is None:
+        print "Failed to find metadata label 'Report Identifier' in column 0"
+        exit()
+    else:
         entry_utf8_row = []
+        for entry in worksheet.row_values(row_i):
+            if entry == "Report Identifier":
+                entry = "hnz.Identifier"
+            if entry == "Title":
+                entry = "dc.Title"
+            entry_utf8 = unicode(entry).encode("utf-8")
+            metadata_entry_utf8 = entry_utf8.replace(" ", "")
+            entry_utf8_row.append(metadata_entry_utf8)
+        row_i = 0;
+        for header_name in xlsutil.sanitized_headers:
+            if header_name in header_names_mapping:
+                header_col_j = header_names_mapping[header_name]
+                header_cell_value = worksheet.cell_value(row_i,header_col_j)
+                if header_cell_value == "Report Identifier":
+                    header_cell_value = "hnz.Identifier"
+                if header_cell_value == "Title":
+                    header_cell_value = "dc.Title"
+                if header_cell_value == "RelevantTLA's":
+                    header_cell_value = "TLA"
+                header_cell_value_utf8 = unicode(header_cell_value).encode("utf-8")
+                metadata_name_utf8 = header_cell_value_utf8.replace(" ", "")
+                entry_utf8_row.append(metadata_name_utf8)
+            else:
+                print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")
         unbound_wr.writerow(entry_utf8_row)
 …
         entry_utf8_row.insert(0, "Filename")
         pdfbound_wr.writerow(entry_utf8_row)
+        # 3. Process the rest of the file ...
         row_i = row_i + 1
 …
             found_pdf = False
+            for col_j in range(num_cols):
+                cell = worksheet.cell(row_i,col_j)
+                cell_type  = worksheet.cell_type(row_i,col_j)
+                cell_value = worksheet.cell_value(row_i,col_j)
+                formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
+                if col_j == 0:
+                    # Check to see if companion PDF file present
+                    # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
+                    id = formatted_cell.replace(" ","")
+                    pdf_file = id_to_relative_pdf_file(id)
+                    pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
+                    PDF_file = pdf_file_root + ".PDF"
+                    pdf_filename = os.path.join(excel_dirname,pdf_file)
+                    PDF_filename = os.path.join(excel_dirname,PDF_file)
+                    if os.path.exists(pdf_filename):
+                        found_pdf = True
+                        formatted_utf8_row.insert(0, pdf_file)
+                    elif os.path.exists(PDF_filename):
+                        found_pdf = True
+                        formatted_utf8_row.insert(0, PDF_file)
+                    else:
+                        print "Unbound id: '" + id + "'"
+                if isinstance(formatted_cell, basestring):
+                    # Remove any trailing whitespace.
+                    # A newline at the end is particularly harmful for an entry in the CSV file
+                    formatted_cell = formatted_cell.rstrip();
+                formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
+                formatted_utf8_row.append(formatted_cell_utf8)
+#            for col_j in range(num_cols):
+            for header_name in xlsutil.sanitized_headers:
+                if header_name in header_names_mapping:
+                    col_j = header_names_mapping[header_name]
+#                    cell = worksheet.cell(row_i,col_j)
+#                    cell_type  = worksheet.cell_type(row_i,col_j)
+                    cell_value = worksheet.cell_value(row_i,col_j)
+#                    formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False)
+                    formatted_cell_value = xlsutil.format_if_int(cell_value)
+#                    if col_j == 0:
+                    if header_name == "Report Identifier":
+                        # Check to see if companion PDF file present
+                        # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell_value+".pdf")
+                        id = formatted_cell_value.replace(" ","")
+                        pdf_file = id_to_relative_pdf_file(id)
+                        pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
+                        PDF_file = pdf_file_root + ".PDF"
+                        pdf_filename = os.path.join(excel_dirname,pdf_file)
+                        PDF_filename = os.path.join(excel_dirname,PDF_file)
+                        if os.path.exists(pdf_filename):
+                            found_pdf = True
+                            formatted_utf8_row.insert(0, pdf_file)
+                        elif os.path.exists(PDF_filename):
+                            found_pdf = True
+                            formatted_utf8_row.insert(0, PDF_file)
+                        else:
+                            print "Unbound id: '" + id + "'"
+                    if isinstance(formatted_cell_value, basestring):
+                        # Remove any trailing whitespace.
+                        # A newline at the end is particularly harmful for an entry in the CSV file
+                        formatted_cell_value = formatted_cell_value.rstrip();
+                    formatted_cell_value_utf8 = unicode(formatted_cell_value).encode("utf-8")
+                    ## Perform any cell transformations to make DL used spreadsheet
+                    ## cleaner to build
+                    # formatted_cell_value_utf8 = fixup_cell(col_j,formatted_cell_value_utf8)
+                    formatted_utf8_row.append(formatted_cell_value_utf8)
+                else:
+                    print "Warning: No column number mapping for header name \""+header_name+"\" => skipping"
             if found_pdf:
                 pdfbound_wr.writerow(formatted_utf8_row)
 …
             row_i = row_i + 1
-    else:
-        print "Failed to find metadata label 'Report Identifier' in column 0"

main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

-              r33174
+              r33188
 import os
+import xlsutil
 ## worksheet_name = "Archaeological reports"
-sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No",
-                      "Authority No", "Site No", "Relevant TLA's", "Record Type" ]
 def excel_to_sanitized_csv(excel_filename):
 …
     # 1. For header line, Build up hashmap of header-names to column number
     # 2. Write out CSV file for only the header-names in 'keep_header_names'
+    # 2. Write out CSV file for only the header-names in 'sanitized_headers'
     num_rows = worksheet.nrows
+    header_names_mapping = {}
+#    row_i = 0
+#    num_header_cols = worksheet.row_len(row_i)
+    # 1. For header line, Build up hashmap of header-names to column number
+    # Assume header line is first row of spreadsheet
+    row_i = 0
+    num_header_cols = worksheet.row_len(row_i)
+    # 1. get header-map
+    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
+    found_header = False
+    for col_j in range(num_header_cols):
+        header_cell_value = worksheet.cell_value(row_i,col_j)
+        if header_cell_value == "Report Identifier":
+            found_header = True
+        header_names_mapping[header_cell_value] = col_j
+    if not(found_header):
+#    found_header = False
+#    for col_j in range(num_header_cols):
+#        header_cell_value = worksheet.cell_value(row_i,col_j)
+#        if header_cell_value == "Report Identifier":
+#            found_header = True
+#
+#        header_names_mapping[header_cell_value] = col_j
+#
+#    if not(found_header):
+    if header_names_mapping is None:
         print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)"
         exit(1)
     # Move on to the start of the data values
     row_i = row_i + 1
+    row_i = 1
+    # 2. Write out CSV file ...
     while row_i<num_rows:
         # Work through sanitized header names building up row of utf8 values
         sanitized_utf8_row = []
         for header_name in sanitized_headers:
+        for header_name in xlsutil.sanitized_headers:
             if header_name in header_names_mapping:
 …
                 cell_value = worksheet.cell_value(row_i,header_col_j)
-                #formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
-                # Numbers from worksheet are represented as floating-point type
-                # This causes a problem when it is an 'int' as it then gets
-                # written out as a floating point number (with a '.0')
-                # => test for float and when there is no value after
-                #    the decimal point, explicitly cast to 'int'
-                if type(cell_value) is float:
-                    if cell_value == int(cell_value): cell_value = int(cell_value)
                 cell_value_utf8 = unicode(cell_value).encode("utf-8")

Note: See TracChangeset for help on using the changeset viewer.