Got my paperwork done early, so here's a sample to read from an Excel file
EDIT: Reworked a bit to work with both .xls and .xlsx files
EDIT: Modified to properly cast data types
EDIT: Added handler for formula cell types
EDIT: Added parameters to optionally specify sheet, row, and column numbers
Reading back a file created from my other example:
def excelToDataSet(fileName, hasHeaders = False, sheetNum = 0, firstRow = None, lastRow = None, firstCol = None, lastCol = None):
import org.apache.poi.ss.usermodel.WorkbookFactory as WorkbookFactory
import org.apache.poi.ss.usermodel.DateUtil as DateUtil
from java.io import FileInputStream
from java.util import Date
"""
Function to create a dataset from an Excel spreadsheet. It will try to automatically detect the boundaries of the data,
but helper parameters are available:
params:
fileName - The path to the Excel spreadsheet. (required)
hasHeaders - If true, uses the first row of the spreadsheet as column names.
sheetNum - select the sheet to process. defaults to the first sheet.
firstRow - select first row to process.
lastRow - select last row to process.
firstCol - select first column to process
lastCol - select last column toprocess
"""
fileStream = FileInputStream(fileName)
wb = WorkbookFactory.create(fileStream)
sheet = wb.getSheetAt(sheetNum)
if firstRow is None:
firstRow = sheet.getFirstRowNum()
if lastRow is None:
lastRow = sheet.getLastRowNum()
data = []
for i in range(firstRow , lastRow + 1):
row = sheet.getRow(i)
print str(i).zfill(3), list(row)
if i == firstRow:
if firstCol is None:
firstCol = row.getFirstCellNum()
if lastCol is None:
lastCol = row.getLastCellNum()
else:
# if lastCol is specified add 1 to it.
lastCol += 1
if hasHeaders:
headers = list(row)[firstCol:lastCol]
else:
headers = ['Col'+str(cNum) for cNum in range(firstCol, lastCol)]
rowOut = []
for j in range(firstCol, lastCol):
if i == firstRow and hasHeaders:
pass
else:
cell = row.getCell(j)
cellType = cell.getCellType().toString()
if cellType == 'NUMERIC':
if DateUtil.isCellDateFormatted(cell):
value = cell.dateCellValue
else:
value = cell.getNumericCellValue()
if value == int(value):
value = int(value)
elif cellType == 'STRING':
value = cell.getStringCellValue()
elif cellType == 'BOOLEAN':
value = cell.getBooleanCellValue()
elif cellType == 'BLANK':
value = None
elif cellType == 'FORMULA':
formulatype=str(cell.getCachedFormulaResultType())
if formulatype == 'NUMERIC':
if DateUtil.isCellDateFormatted(cell):
value = cell.dateCellValue
else:
value = cell.getNumericCellValue()
if value == int(value):
value = int(value)
elif formulatype == 'STRING':
value = cell.getStringCellValue()
elif formulatype == 'BOOLEAN':
value = cell.getBooleanCellValue()
elif formulatype == 'BLANK':
value = None
else:
value = None
rowOut.append(value)
if len(rowOut) > 0:
data.append(rowOut)
fileStream.close()
return system.dataset.toDataSet(headers, data)
# Read .xlsx file
fName = 'C:/Test/poi1.xlsx'
ds1 = excelToDataSet(fName, True)
util.printDataSet(ds1)
#Read .xls file
fName = 'C:/Test/poi2.xls'
ds2 = excelToDataSet(fName, True)
util.printDataSet(ds2)
row | t_stamp | sValue | iValue | fValue
-------------------------------------------------------------
0 | Thu Jan 21 08:00:00 EST 2021 | A | 1 | 1.23
1 | Fri Jan 22 08:30:00 EST 2021 | B | 2 | 4.56
2 | Sat Jan 23 09:24:00 EST 2021 | C | 3 | 7.89
3 | Sun Jan 24 19:37:00 EST 2021 | D | 4 | 0.12
4 | Mon Jan 25 18:42:00 EST 2021 | E | 5 | 3.45
row | t_stamp | sValue | iValue | fValue
-------------------------------------------------------------
0 | Thu Jan 21 08:00:00 EST 2021 | Z | 5 | 3.45
1 | Fri Jan 22 08:30:00 EST 2021 | Y | 3 | 0.12
2 | Sat Jan 23 09:24:00 EST 2021 | X | 2 | 7.89
3 | Sun Jan 24 19:37:00 EST 2021 | W | 4 | 4.56
4 | Mon Jan 25 18:42:00 EST 2021 | V | 1 | 1.23