Hi! I´m trying to create an app that processes some Excel files uploaded by the user. I have and UploadFile component with an event that sends the file bytes (file_data = event.file.getBytes()) to the following function:
from java.io import ByteArrayInputStream
from org.apache.poi.xssf.usermodel import XSSFWorkbook
from java.util import ArrayList
from java.lang import String
def read_excel_bytes(bytes):
stream = ByteArrayInputStream(bytes)
workbook = XSSFWorkbook(stream)
sheet = workbook.getSheetAt(0) # Hoja 0
headers = []
headerRow = sheet.getRow(0)
for i in range(headerRow.getLastCellNum()):
cell = headerRow.getCell(i)
headers.append(cell.getStringCellValue() if cell else "")
data = []
for rowIndex in range(1, sheet.getLastRowNum() + 1):
row = sheet.getRow(rowIndex)
fila = []
for colIndex in range(len(headers)):
cell = row.getCell(colIndex)
if cell is None:
fila.append("")
else:
tipo = cell.getCellType()
if tipo == "STRING":
fila.append(cell.getStringCellValue())
elif tipo == "NUMERIC":
fila.append(cell.getNumericCellValue())
elif tipo == "BOOLEAN":
fila.append(cell.getBooleanCellValue())
else:
fila.append(str(cell)) # fallback
data.append(fila)
dataset = system.dataset.toDataSet(headers, data)
return dataset
However, when I upload files up to 10k rows the memory usage increases highly. Does someone know an alternative and lightweight method for extracting Excel data in Perspective? Thanks!
You'll want to use the Streaming reader via OPCPackage.open().
Note that you have to accept the data as it comes in and pluck out the parts you are interested in. Potentially sorting afterward to deal with arbitrary ordering.
I haven't done this, but doesn't look like rocket science. It is clearly intended for reduced memory consumption.
It might help to use this API with a temporary file instead of opening the uploaded byte stream.
Thanks for your time, I´ve reached the following implementation creating a tempfile and reading it using OPCPackage and XSSFReader. However, the CPU usage remains quite high (altought it´s more stable than the previous approach), even though the Excel files I´m handling have at most 11k rows (about 1.4 MB size at max).
import tempfile
from java.io import ByteArrayInputStream
from org.apache.poi.openxml4j.opc import OPCPackage
from org.apache.poi.xssf.eventusermodel import XSSFReader
from org.apache.poi.xssf.model import SharedStringsTable
from org.xml.sax import InputSource
from org.xml.sax.helpers import DefaultHandler
from javax.xml.parsers import SAXParserFactory
class Handler(DefaultHandler):
def __init__(self, sst):
self.sst = sst
self.header = None
self.current_row = []
self.text = ""
self.is_string = False
self.reading_value = False
self.row_count = 0
self.results = []
def startElement(self, uri, localName, qName, attrs):
if qName == "row":
self.current_row = []
elif qName == "c":
self.is_string = attrs.getValue("t") == "s" if attrs.getValue("t") else False
elif qName == "v":
self.reading_value = True
self.text = ""
def characters(self, ch, start, length):
if self.reading_value:
self.text += "".join(ch[start:start+length])
def endElement(self, uri, localName, qName):
if qName == "v":
if self.is_string:
idx = int(self.text)
valor = self.sst.getItemAt(idx).getString()
else:
valor = self.text
self.current_row.append(valor)
self.reading_value = False
elif qName == "row":
if self.row_count == 0:
self.header = self.current_row
else:
fila_dict = {}
for i, key in enumerate(self.header):
val = self.current_row[i] if i < len(self.current_row) else None
fila_dict[key] = val
self.results.append(fila_dict)
self.row_count += 1
def read_first_sheet_from_bytes(byte_array):
temp_file = tempfile.NamedTemporaryFile(delete=True)
temp_file.write(byte_array)
temp_file.flush()
pkg = OPCPackage.open(temp_file.name)
results = []
try:
reader = XSSFReader(pkg)
sst = reader.getSharedStringsTable()
sheets = reader.getSheetsData()
if sheets.hasNext():
current_sheet = sheets.next()
parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader()
handler = Handler(sst)
parser.setContentHandler(handler)
parser.parse(InputSource(current_sheet))
current_sheet.close()
results = handler.results
finally:
pkg.close()
temp_file.close()
return results
You are blazing a trail for the rest of us here. Thanks for reporting back.